diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index 43ded0af3ac21cf..801a701724789ad 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -22,12 +22,6 @@ on: - '.github/workflows/libclang-python-tests.yml' - '.github/workflows/llvm-project-tests.yml' -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - jobs: check-clang-python: # Build libclang and then run the libclang Python binding's unit tests. diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index 17a54be16badc15..95a3890c0d2dc7f 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -51,7 +51,7 @@ concurrency: # Cancel intermediate builds: only if it is a pull request build. # If the group name here is the same as the group name in the workflow that includes # this one, then the action will try to wait on itself and get stuck. - group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}${{ github.ref }} + group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}-${{ inputs.python_version }}${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 0c8935457366dbb..6d3b797da3787e3 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -113,6 +113,10 @@ Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched. +- `--debug-thread-count=` + + Specifies the number of threads to be used when processing DWO debug information. 
+ - `--dot-tooltip-code` Add basic block instructions as tool tips on nodes diff --git a/bolt/docs/OptimizingLinux.md b/bolt/docs/OptimizingLinux.md index 0045f0ead9fd0d0..c85fecabcccc292 100644 --- a/bolt/docs/OptimizingLinux.md +++ b/bolt/docs/OptimizingLinux.md @@ -37,7 +37,7 @@ Convert `perf` profile into a format suitable for BOLT passing the `vmlinux` bin ```bash -$ sudo chwon $USER perf.data +$ sudo chown $USER perf.data $ perf2bolt -p perf.data -o perf.fdata vmlinux ``` diff --git a/bolt/include/bolt/Core/ParallelUtilities.h b/bolt/include/bolt/Core/ParallelUtilities.h index e7b35a79acc78cc..9f75e2f6bd81d29 100644 --- a/bolt/include/bolt/Core/ParallelUtilities.h +++ b/bolt/include/bolt/Core/ParallelUtilities.h @@ -50,7 +50,8 @@ enum SchedulingPolicy { }; /// Return the managed thread pool and initialize it if not initialized. -ThreadPoolInterface &getThreadPool(); +ThreadPoolInterface & +getThreadPool(const unsigned ThreadsCount = opts::ThreadCount); /// Perform the work on each BinaryFunction except those that are accepted /// by SkipPredicate, scheduling heuristic is based on SchedPolicy. diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index deaf179140c0145..624245650a0924b 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -184,7 +184,8 @@ class DWARFRewriter { /// Output .dwo files. 
void writeDWOFiles(DWARFUnit &, const OverriddenSectionsMap &, const std::string &, DebugLocWriter &, - DebugStrOffsetsWriter &, DebugStrWriter &); + DebugStrOffsetsWriter &, DebugStrWriter &, + DebugRangesSectionWriter &); using KnownSectionsEntry = std::pair; }; diff --git a/bolt/lib/Core/ParallelUtilities.cpp b/bolt/lib/Core/ParallelUtilities.cpp index a24c37c06f1ac1c..3a8a7dc0aee7bd7 100644 --- a/bolt/lib/Core/ParallelUtilities.cpp +++ b/bolt/lib/Core/ParallelUtilities.cpp @@ -49,7 +49,7 @@ namespace ParallelUtilities { namespace { /// A single thread pool that is used to run parallel tasks -std::unique_ptr ThreadPoolPtr; +std::unique_ptr ThreadPoolPtr; unsigned computeCostFor(const BinaryFunction &BF, const PredicateTy &SkipPredicate, @@ -102,12 +102,15 @@ inline unsigned estimateTotalCost(const BinaryContext &BC, } // namespace -ThreadPoolInterface &getThreadPool() { +ThreadPoolInterface &getThreadPool(const unsigned ThreadsCount) { if (ThreadPoolPtr.get()) return *ThreadPoolPtr; - ThreadPoolPtr = std::make_unique( - llvm::hardware_concurrency(opts::ThreadCount)); + if (ThreadsCount > 1) + ThreadPoolPtr = std::make_unique( + llvm::hardware_concurrency(ThreadsCount)); + else + ThreadPoolPtr = std::make_unique(); return *ThreadPoolPtr; } diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 98f81f44d64901f..f9cb1b3895e79bb 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -329,6 +329,12 @@ static cl::opt KeepARanges( "keep or generate .debug_aranges section if .gdb_index is written"), cl::Hidden, cl::cat(BoltCategory)); +static cl::opt + DebugThreadCount("debug-thread-count", + cl::desc("specifies thread count for the multithreading " + "for updating DWO debug info"), + cl::init(1), cl::cat(BoltCategory)); + static cl::opt DwarfOutputPath( "dwarf-output-path", cl::desc("Path to where .dwo files will be written out to."), cl::init(""), @@ -475,8 +481,8 @@ static void 
emitDWOBuilder(const std::string &DWOName, DWARFUnit &SplitCU, DWARFUnit &CU, DebugLocWriter &LocWriter, DebugStrOffsetsWriter &StrOffstsWriter, - DebugStrWriter &StrWriter, - GDBIndex &GDBIndexSection) { + DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection, + DebugRangesSectionWriter &TempRangesSectionWriter) { // Populate debug_info and debug_abbrev for current dwo into StringRef. DWODIEBuilder.generateAbbrevs(); DWODIEBuilder.finish(); @@ -532,7 +538,7 @@ static void emitDWOBuilder(const std::string &DWOName, OverriddenSections[Kind] = Contents; } Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter, - StrOffstsWriter, StrWriter); + StrOffstsWriter, StrWriter, TempRangesSectionWriter); } using DWARFUnitVec = std::vector; @@ -646,7 +652,6 @@ void DWARFRewriter::updateDebugInfo() { *StrWriter); GDBIndex GDBIndexSection(BC); auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU, - DIEBuilder &DIEBlder, DebugRangesSectionWriter &TempRangesSectionWriter, DebugAddrWriter &AddressWriter, const std::string &DWOName, @@ -669,7 +674,7 @@ void DWARFRewriter::updateDebugInfo() { emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter, - GDBIndexSection); + GDBIndexSection, TempRangesSectionWriter); }; auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) { std::optional SplitCU; @@ -716,9 +721,13 @@ void DWARFRewriter::updateDebugInfo() { finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection); CUPartitionVector PartVec = partitionCUs(*BC.DwCtx); + const unsigned int ThreadCount = + std::min(opts::DebugThreadCount, opts::ThreadCount); for (std::vector &Vec : PartVec) { DIEBlder.buildCompileUnits(Vec); llvm::SmallVector, 72> DWODIEBuildersByCU; + ThreadPoolInterface &ThreadPool = + ParallelUtilities::getThreadPool(ThreadCount); for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) { createRangeLocListAddressWriters(*CU); std::optional SplitCU; @@ -729,9 +738,9 @@ void 
DWARFRewriter::updateDebugInfo() { continue; DebugAddrWriter &AddressWriter = *AddressWritersByCU[CU->getOffset()].get(); - DebugRangesSectionWriter *TempRangesSectionWriter = - CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get() - : LegacyRangesWritersByCU[*DWOId].get(); + DebugRangesSectionWriter &TempRangesSectionWriter = + CU->getVersion() >= 5 ? *RangeListsWritersByCU[*DWOId].get() + : *LegacyRangesWritersByCU[*DWOId].get(); std::optional DwarfOutputPath = opts::DwarfOutputPath.empty() ? std::nullopt @@ -744,9 +753,17 @@ void DWARFRewriter::updateDebugInfo() { *DWODIEBuildersByCU.emplace_back(std::move(DWODIEBuilderPtr)).get(); if (CU->getVersion() >= 5) StrOffstsWriter->finalizeSection(*CU, DIEBlder); - processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter, - AddressWriter, DWOName, DwarfOutputPath, DWODIEBuilder); + // Important to capture CU and SplitCU by value here, otherwise when the + // thread is executed at some point after the current iteration of the + // loop, dereferencing CU/SplitCU in the call to processSplitCU means it + // will dereference a different variable than the one intended, causing a + // seg fault. + ThreadPool.async([&, DwarfOutputPath, DWOName, CU, SplitCU] { + processSplitCU(*CU, **SplitCU, TempRangesSectionWriter, AddressWriter, + DWOName, DwarfOutputPath, DWODIEBuilder); + }); } + ThreadPool.wait(); for (std::unique_ptr &DWODIEBuilderPtr : DWODIEBuildersByCU) DWODIEBuilderPtr->updateDebugNamesTable(); for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) @@ -1807,7 +1824,8 @@ std::optional updateDebugData( void DWARFRewriter::writeDWOFiles( DWARFUnit &CU, const OverriddenSectionsMap &OverridenSections, const std::string &DWOName, DebugLocWriter &LocWriter, - DebugStrOffsetsWriter &StrOffstsWriter, DebugStrWriter &StrWriter) { + DebugStrOffsetsWriter &StrOffstsWriter, DebugStrWriter &StrWriter, + DebugRangesSectionWriter &TempRangesSectionWriter) { // Setup DWP code once. 
DWARFContext *DWOCtx = BC.getDWOContext(); const uint64_t DWOId = *CU.getDWOId(); @@ -1854,9 +1872,8 @@ void DWARFRewriter::writeDWOFiles( DebugRangeListsSectionWriter *RangeListssWriter = nullptr; if (CU.getVersion() == 5) { - assert(RangeListsWritersByCU.count(DWOId) != 0 && - "No RangeListsWriter for DWO ID."); - RangeListssWriter = RangeListsWritersByCU[DWOId].get(); + RangeListssWriter = + llvm::dyn_cast(&TempRangesSectionWriter); // Handling .debug_rnglists.dwo separately. The original .o/.dwo might not // have .debug_rnglists so won't be part of the loop below. diff --git a/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test b/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test index 555887a067589f3..b06cec6fe6b962c 100644 --- a/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test +++ b/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-dst.s -o %t.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-src.s -o %t1.o # RUN: %clang %cflags -gdwarf-4 %t.o %t1.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test b/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test index 74c9491d95d36ea..9adbf4eef9114ca 100644 --- a/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test +++ b/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj 
-triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-dst.s -o %t.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-src.s -o %t1.o # RUN: %clang %cflags -gdwarf-4 %t1.o %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test b/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test index 6bcf8892ed0a8a0..6f14c95236f1319 100644 --- a/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test +++ b/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-two-entries-loclist.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-df-dualcu-loclist.test b/bolt/test/X86/dwarf4-df-dualcu-loclist.test index 57c75e282421aca..a094d46c8354c73 100644 --- a/bolt/test/X86/dwarf4-df-dualcu-loclist.test +++ b/bolt/test/X86/dwarf4-df-dualcu-loclist.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc 
-dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-loclist-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -O2 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo.dwo | FileCheck -check-prefix=BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper.dwo | FileCheck -check-prefix=PRE-BOLT-DWO-HELPER %s diff --git a/bolt/test/X86/dwarf4-split-dwarf-no-address.test b/bolt/test/X86/dwarf4-split-dwarf-no-address.test index fc6d8d324b9597c..014f76ca3d21ca5 100644 --- a/bolt/test/X86/dwarf4-split-dwarf-no-address.test +++ b/bolt/test/X86/dwarf4-split-dwarf-no-address.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ ; RUN: --filetype=obj %p/Inputs/dwarf4-split-dwarf-no-address-helper.s -o=helper.o ; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe -fno-pic -no-pie -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt | FileCheck -check-prefix=BOLT %s ;; Testing that there are no asserts/crashes when one of the DWARF4 CUs does not modify .debug_addr diff --git a/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test b/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test index c9ade995b70878d..4e98299c7aa3696 100644 --- a/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test +++ 
b/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-subprogram-multiple-ranges-main.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-subprogram-multiple-ranges-other.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-objdump %t.bolt --disassemble > %t1.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t1.txt # RUN: cat %t1.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-types-dwarf5-types.test b/bolt/test/X86/dwarf4-types-dwarf5-types.test index a253f2283609017..af01d1467e73f44 100644 --- a/bolt/test/X86/dwarf4-types-dwarf5-types.test +++ b/bolt/test/X86/dwarf4-types-dwarf5-types.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf4-types-dwarf5.test b/bolt/test/X86/dwarf4-types-dwarf5.test index 1eb42683e40ee81..dd0a8efe3f520a9 100644 --- a/bolt/test/X86/dwarf4-types-dwarf5.test +++ b/bolt/test/X86/dwarf4-types-dwarf5.test @@ -3,7 +3,7 @@ # RUN: llvm-mc 
-dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf5-addr-section-reuse.s b/bolt/test/X86/dwarf5-addr-section-reuse.s index cf511d6d111e079..10cecf7b2964247 100644 --- a/bolt/test/X86/dwarf5-addr-section-reuse.s +++ b/bolt/test/X86/dwarf5-addr-section-reuse.s @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-helper2-addr-section-reuse.s -o %thelper2.o # RUN: %clang %cflags -dwarf-5 %thelper1.o %tmain.o %thelper2.o -o %t.exe -Wl,-q # RUN: llvm-dwarfdump --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-info %t.exe.bolt | FileCheck --check-prefix=POSTCHECK %s ## This test checks that when a binary is bolted if CU is not modified and has DW_AT_addr_base that is shared diff --git a/bolt/test/X86/dwarf5-call-pc-function-null-check.test b/bolt/test/X86/dwarf5-call-pc-function-null-check.test index 761a4da696217c5..2b489542f998289 100644 --- a/bolt/test/X86/dwarf5-call-pc-function-null-check.test +++ b/bolt/test/X86/dwarf5-call-pc-function-null-check.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux 
%p/Inputs/dwarf5-call-pc-function-null-check-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-function-null-check-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=CHECK %s diff --git a/bolt/test/X86/dwarf5-call-pc.test b/bolt/test/X86/dwarf5-call-pc.test index dc7773dc053d90a..a4359295556b9b0 100644 --- a/bolt/test/X86/dwarf5-call-pc.test +++ b/bolt/test/X86/dwarf5-call-pc.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-cu-no-debug-addr.test b/bolt/test/X86/dwarf5-cu-no-debug-addr.test index e78b68680d6cc10..44721b187504b1e 100644 --- a/bolt/test/X86/dwarf5-cu-no-debug-addr.test +++ b/bolt/test/X86/dwarf5-cu-no-debug-addr.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux 
%p/Inputs/dwarf5-cu-no-debug-addr-main.s -o %t1main.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-cu-no-debug-addr-helper.s -o %t1helper.o # RUN: %clang %cflags -dwarf-5 %t1main.o %t1helper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test b/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test index a325395fd532027..801389c60feacc4 100644 --- a/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test +++ b/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-other.s \ ; RUN: -split-dwarf-file=mainOther.dwo -o other.o ; RUN: %clang %cflags main.o other.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-rnglists main.exe.bolt &> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.exe.bolt >> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt diff --git a/bolt/test/X86/dwarf5-df-mono-dualcu.test b/bolt/test/X86/dwarf5-df-mono-dualcu.test index 13272cc1c3c4da9..66c0a3b1ad6fb33 100644 --- a/bolt/test/X86/dwarf5-df-mono-dualcu.test +++ b/bolt/test/X86/dwarf5-df-mono-dualcu.test @@ -5,7 +5,7 @@ ; RUN: -split-dwarf-file=main.dwo -o main.o ; RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux-gnu %p/Inputs/dwarf5-df-mono-helper.s 
-o=helper.o ; RUN: %clang %cflags -gdwarf-5 main.o helper.o -o main.exe -fno-pic -no-pie -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --always-convert-to-ranges +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --always-convert-to-ranges --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.exe.bolt &> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt diff --git a/bolt/test/X86/dwarf5-df-output-dir-same-name.test b/bolt/test/X86/dwarf5-df-output-dir-same-name.test index b466f87d95e5eb1..29658a8b5910e87 100644 --- a/bolt/test/X86/dwarf5-df-output-dir-same-name.test +++ b/bolt/test/X86/dwarf5-df-output-dir-same-name.test @@ -9,7 +9,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-output-dir-same-name-helper.s \ ; RUN: -split-dwarf-file=objects/o2/split.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --dwarf-output-path=%t/dwo +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --dwarf-output-path=%t/dwo --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: ls -l %t/dwo > log ; RUN: llvm-dwarfdump --debug-info main.exe.bolt >> log ; RUN: cat log | FileCheck -check-prefix=BOLT %s diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test index 7c1c8e4fd5b383d..f96a5b8dccf0b53 100644 --- a/bolt/test/X86/dwarf5-df-types-debug-names.test +++ b/bolt/test/X86/dwarf5-df-types-debug-names.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang 
%cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test index c8cfd82753d7795..e8c78b211cc4b9d 100644 --- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test +++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-dup-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test index 12a7f648c23257f..024bb5613f94b1f 100644 --- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test +++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt 
--update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test index 10ad6ed404f1c1a..465062560d4fc14 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-dwarf4-gdb-index-types-v8.generted-gdb11.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test index 2da0bcca89b2ac7..484cf0f1526781d 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-dwarf4-gdb-index-types-v8.generted-gdb9.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test index 9be540352005de1..7589bfac57f58aa 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -Wl,--gdb-index -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by LLD. diff --git a/bolt/test/X86/dwarf5-dwarf4-monolithic.test b/bolt/test/X86/dwarf5-dwarf4-monolithic.test index ff0f6990aaac0f3..37fded1f3d6fdd7 100644 --- a/bolt/test/X86/dwarf5-dwarf4-monolithic.test +++ b/bolt/test/X86/dwarf5-dwarf4-monolithic.test @@ -5,7 +5,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper1.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper2.s -o %t2.o # RUN: %clang %cflags -dwarf-5 %tmain.o %t0.o %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-line %t.exe > %t_line.txt # 
RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt diff --git a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test index b48d6a5dc20d4d9..9ff64cb1ca250e7 100644 --- a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test +++ b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-types-backward-forward-cross-reference-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-types-backward-forward-cross-reference-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=POSTCHECKADDR %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf5-empty-arange.test b/bolt/test/X86/dwarf5-empty-arange.test index 61e966204843420..4ed3c1dc0a6e496 100644 --- a/bolt/test/X86/dwarf5-empty-arange.test +++ b/bolt/test/X86/dwarf5-empty-arange.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-empty-arange-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-empty-arange-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,--entry=main -Wl,-q -Wl,-gc-sections -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o 
%t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-aranges %t.bolt > %t.txt # RUN: llvm-dwarfdump --debug-info -r 0 %t.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test index 338a476e46f3b3e..139b24afa1b0dac 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-gdb-index-types-v8.generted-gdb11.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test index c9d3913a1933cda..26ee101e9d1d189 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-gdb-index-types-v8.generted-gdb9.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. diff --git a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test index a770e40260dde34..731c56013339957 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -Wl,--gdb-index -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by LLD. 
diff --git a/bolt/test/X86/dwarf5-locexpr-referrence.test b/bolt/test/X86/dwarf5-locexpr-referrence.test index cc7bb27ce602e7b..5b38987e0a7126a 100644 --- a/bolt/test/X86/dwarf5-locexpr-referrence.test +++ b/bolt/test/X86/dwarf5-locexpr-referrence.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-locexpr-referrence-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-locexpr-referrence-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=CHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=CHECKADDR %s diff --git a/bolt/test/X86/dwarf5-loclist-offset-form.test b/bolt/test/X86/dwarf5-loclist-offset-form.test index 3178c11a67069fb..de6a6090efa7f4b 100644 --- a/bolt/test/X86/dwarf5-loclist-offset-form.test +++ b/bolt/test/X86/dwarf5-loclist-offset-form.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist-offset-form-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist-offset-form-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt 
>> %t.txt diff --git a/bolt/test/X86/dwarf5-one-loclists-two-bases.test b/bolt/test/X86/dwarf5-one-loclists-two-bases.test index f25f6c7a468581e..9e6fdb94b916a8d 100644 --- a/bolt/test/X86/dwarf5-one-loclists-two-bases.test +++ b/bolt/test/X86/dwarf5-one-loclists-two-bases.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loc-base-no-loc-accesshelper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-return-pc-form-addr.test b/bolt/test/X86/dwarf5-return-pc-form-addr.test index 5a83615cac031cc..8e35ef670cfc2cb 100644 --- a/bolt/test/X86/dwarf5-return-pc-form-addr.test +++ b/bolt/test/X86/dwarf5-return-pc-form-addr.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-form-addr-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: 
llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-return-pc.test b/bolt/test/X86/dwarf5-return-pc.test index e9ef99ef5b945fb..21b6854474a62fc 100644 --- a/bolt/test/X86/dwarf5-return-pc.test +++ b/bolt/test/X86/dwarf5-return-pc.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-shared-str-offset-base.s b/bolt/test/X86/dwarf5-shared-str-offset-base.s index d8492298a1604b5..f00efd54a4b02c6 100644 --- a/bolt/test/X86/dwarf5-shared-str-offset-base.s +++ b/bolt/test/X86/dwarf5-shared-str-offset-base.s @@ -3,7 +3,7 @@ # RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %tmain.o --defsym MAIN=0 # RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %tmain.exe -# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-info %tmain.exe.bolt > %tout.text # RUN: llvm-dwarfdump --show-section-sizes %tmain.exe >> %tout.text # RUN: llvm-dwarfdump --show-section-sizes %tmain.exe.bolt >> %tout.text diff --git 
a/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test b/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test index 2cfe5e26bd4cdc1..3eb6a724523d987 100644 --- a/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test +++ b/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test @@ -9,7 +9,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux -split-dwarf-file=helper1.dwo %p/Inputs/dwarf5-split-dwarf4-monolithic-helper1.s -o helper1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper2.s -o helper2.o # RUN: %clang %cflags -dwarf-5 main.o helper0.o helper1.o helper2.o -o main.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges main.exe -o main.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges main.exe -o main.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-line main.exe | FileCheck --check-prefix=PRECHECK-LINE %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.bolt > boltout.txt diff --git a/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test b/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test index ec2b8f7084c78d6..6fcb5a97c1488c9 100644 --- a/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test +++ b/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test @@ -7,7 +7,7 @@ # RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-helper.s -o helpergdb.o # RUN: %clang %cflags maingdb.o helpergdb.o -o maingdb.exe -Wl,-q # RUN: llvm-objcopy maingdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-split-gdb-index-types-v8.gdb-index -# RUN: llvm-bolt maingdb.exe -o maingdb.exe.bolt --update-debug-sections +# RUN: llvm-bolt maingdb.exe -o 
maingdb.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index maingdb.exe.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB with split-dwarf DWARF4. diff --git a/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test b/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test index bcf63fe6a0d8ceb..57f8d7e99bcafa8 100644 --- a/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test +++ b/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-subprogram-multiple-ranges-main.s -o %t1.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-subprogram-multiple-ranges-other.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-objdump %t.bolt --disassemble > %t1.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t1.txt # RUN: cat %t1.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test index 20503951df4e18f..e59664e3281a1a7 100644 --- a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 
--cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.exe > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=CHECK %s diff --git a/bolt/test/X86/dwarf5-two-loclists.test b/bolt/test/X86/dwarf5-two-loclists.test index a7c6351f9813cc6..5b3417e86109a92 100644 --- a/bolt/test/X86/dwarf5-two-loclists.test +++ b/bolt/test/X86/dwarf5-two-loclists.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-two-rnglists.test b/bolt/test/X86/dwarf5-two-rnglists.test index 98f2e347d7673b6..3db47f983eaa26c 100644 --- a/bolt/test/X86/dwarf5-two-rnglists.test +++ b/bolt/test/X86/dwarf5-two-rnglists.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump 
--show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test index 21ced6ce687b5c2..dc6255ff8c7bc2d 100644 --- a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-basic-cu.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-no-cu.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.exe | FileCheck -check-prefix=PRE-BOLT %s # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt | FileCheck -check-prefix=POST-BOLT %s diff --git a/bolt/test/X86/dwarf5-types-backward-cross-reference.s b/bolt/test/X86/dwarf5-types-backward-cross-reference.s index 2345cac2fde9697..17a8d620493a608 100644 --- a/bolt/test/X86/dwarf5-types-backward-cross-reference.s +++ b/bolt/test/X86/dwarf5-types-backward-cross-reference.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o # RUN: %clang %cflags -gdwarf-5 %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## This test checks that BOLT handles 
backward cross CU references for dwarf5 diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test index 94624298e289de7..8da35574b9e997d 100644 --- a/bolt/test/X86/dwarf5-types-debug-names.test +++ b/bolt/test/X86/dwarf5-types-debug-names.test @@ -1,7 +1,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-helper.s -o %thelper.o ; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe -; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info --debug-names %tmain.exe.bolt > %tlog.txt ; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 8837ac16e882817..be52af77ae0a519 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -164,6 +164,10 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { StringRef SmartptrText = Lexer::getSourceText( CharSourceRange::getTokenRange(Smartptr->getSourceRange()), *Result.SourceManager, getLangOpts()); + // Check if the last two characters are "->" and remove them + if (SmartptrText.ends_with("->")) { + SmartptrText = SmartptrText.drop_back(2); + } // Replace foo->get() with *foo, and foo.get() with foo. std::string Replacement = Twine(IsPtrToPtr ? 
"*" : "", SmartptrText).str(); diag(GetCall->getBeginLoc(), "redundant get() call on smart pointer") diff --git a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt index 9f327ce838b7073..b220cbea80f1b6d 100644 --- a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt @@ -33,7 +33,6 @@ clang_target_link_libraries(clangTidyMain # Support plugins. if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) endif() add_clang_tool(clang-tidy @@ -42,7 +41,6 @@ add_clang_tool(clang-tidy DEPENDS clang-resource-headers ${support_plugins} - ${export_symbols} ) clang_target_link_libraries(clang-tidy PRIVATE @@ -59,6 +57,10 @@ target_link_libraries(clang-tidy ${ALL_CLANG_TIDY_CHECKS} ) +if(CLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(clang-tidy) +endif() + install(PROGRAMS clang-tidy-diff.py DESTINATION "${CMAKE_INSTALL_DATADIR}/clang" COMPONENT clang-tidy) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 642ad39cc0c1c5d..b72d109b3d3938b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -104,6 +104,10 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`readability-redundant-smartptr-get + ` check to + remove `->`, when reduntant `get()` is removed. 
+ Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp index 01f12b6bfe6ea0b..ec4ca4cb79484b2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp @@ -20,6 +20,23 @@ struct shared_ptr { explicit operator bool() const noexcept; }; +template +struct vector { + vector(); + bool operator==(const vector& other) const; + bool operator!=(const vector& other) const; + unsigned long size() const; + bool empty() const; + + using iterator = T*; + + iterator begin(); + iterator end(); + + T* data; + unsigned long sz; +}; + } // namespace std struct Bar { @@ -235,3 +252,34 @@ void Negative() { if (MACRO(x) == nullptr) ; } + +void test_redundant_get() { + std::vector> v; + auto f = [](int) {}; + for (auto i = v.begin(); i != v.end(); ++i) { + f(*i->get()); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: redundant get() call + // CHECK-FIXES: f(**i); + } +} + +struct Inner { + int a; + int *getValue() { return &a; } +}; + +struct Example { + Inner inner; + Inner* get() { return &inner; } + int *getValue() { return inner.getValue(); } +}; + +void test_redundant_get_with_member() { + std::vector> v; + auto f = [](int) {}; + for (auto i = v.begin(); i != v.end(); ++i) { + f(*(*i).get()->get()->getValue()); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: redundant get() call + // CHECK-FIXES: f(**i->get()->getValue()); + } +} diff --git a/clang/cmake/modules/AddClang.cmake b/clang/cmake/modules/AddClang.cmake index 9f264720b1e9ee9..5327b5d2f089288 100644 --- a/clang/cmake/modules/AddClang.cmake +++ b/clang/cmake/modules/AddClang.cmake @@ -160,7 +160,7 @@ macro(add_clang_tool name) AND (NOT LLVM_DISTRIBUTION_COMPONENTS OR ${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS) ) set(get_obj_args ${ARGN}) - list(FILTER 
get_obj_args EXCLUDE REGEX "^(SUPPORT_PLUGINS|EXPORT_SYMBOLS_FOR_PLUGINS)$") + list(FILTER get_obj_args EXCLUDE REGEX "^SUPPORT_PLUGINS$") generate_llvm_objects(${name} ${get_obj_args}) add_custom_target(${name} DEPENDS llvm-driver clang-resource-headers) else() diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a355b8db11faf0a..7beef7be0e6a533 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -75,8 +75,8 @@ sections with improvements to Clang's support for those languages. C++ Language Changes -------------------- -- Allow single element access of GCC vector/ext_vector_type object to be - constant expression. Supports the `V.xyzw` syntax and other tidbits +- Allow single element access of GCC vector/ext_vector_type object to be + constant expression. Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. C++17 Feature Support @@ -101,6 +101,16 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Allow calling initializer list constructors from initializer lists with + a single element of the same type instead of always copying. + (`CWG2137: List-initialization from object of same type `) + +- Speculative resolution for CWG2311 implemented so that the implementation of CWG2137 doesn't remove + previous cases where guaranteed copy elision was done. Given a prvalue ``e`` of class type + ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful. + Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. + (`CWG2311: Missed case for guaranteed copy elision `) + C Language Changes ------------------ @@ -156,7 +166,6 @@ Improvements to Clang's diagnostics - Clang now diagnoses undefined behavior in constant expressions more consistently. This includes invalid shifts, and signed overflow in arithmetic. 
- -Wdangling-assignment-gsl is enabled by default. -- Clang now does a better job preserving the template arguments as written when specializing concepts. - Clang now always preserves the template arguments as written used to specialize template type aliases. diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index a4804e4c6f61ca6..27618604192c51f 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2134,6 +2134,23 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned hasTypeDifferentFromDecl : 1; }; + class TemplateTypeParmTypeBitfields { + friend class TemplateTypeParmType; + + LLVM_PREFERRED_TYPE(TypeBitfields) + unsigned : NumTypeBits; + + /// The depth of the template parameter. + unsigned Depth : 15; + + /// Whether this is a template parameter pack. + LLVM_PREFERRED_TYPE(bool) + unsigned ParameterPack : 1; + + /// The index of the template parameter. + unsigned Index : 16; + }; + class SubstTemplateTypeParmTypeBitfields { friend class SubstTemplateTypeParmType; @@ -2257,6 +2274,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { TypeWithKeywordBitfields TypeWithKeywordBits; ElaboratedTypeBitfields ElaboratedTypeBits; VectorTypeBitfields VectorTypeBits; + TemplateTypeParmTypeBitfields TemplateTypeParmTypeBits; SubstTemplateTypeParmTypeBitfields SubstTemplateTypeParmTypeBits; SubstTemplateTypeParmPackTypeBitfields SubstTemplateTypeParmPackTypeBits; TemplateSpecializationTypeBitfields TemplateSpecializationTypeBits; @@ -6135,52 +6153,30 @@ class BTFTagAttributedType : public Type, public llvm::FoldingSetNode { class TemplateTypeParmType : public Type, public llvm::FoldingSetNode { friend class ASTContext; // ASTContext creates these - // Helper data collector for canonical types. - struct CanonicalTTPTInfo { - unsigned Depth : 15; - unsigned ParameterPack : 1; - unsigned Index : 16; - }; - - union { - // Info for the canonical type. 
- CanonicalTTPTInfo CanTTPTInfo; - - // Info for the non-canonical type. - TemplateTypeParmDecl *TTPDecl; - }; + // The associated TemplateTypeParmDecl for the non-canonical type. + TemplateTypeParmDecl *TTPDecl; - /// Build a non-canonical type. - TemplateTypeParmType(TemplateTypeParmDecl *TTPDecl, QualType Canon) + TemplateTypeParmType(unsigned D, unsigned I, bool PP, + TemplateTypeParmDecl *TTPDecl, QualType Canon) : Type(TemplateTypeParm, Canon, TypeDependence::DependentInstantiation | - (Canon->getDependence() & TypeDependence::UnexpandedPack)), - TTPDecl(TTPDecl) {} - - /// Build the canonical type. - TemplateTypeParmType(unsigned D, unsigned I, bool PP) - : Type(TemplateTypeParm, QualType(this, 0), - TypeDependence::DependentInstantiation | - (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)) { - CanTTPTInfo.Depth = D; - CanTTPTInfo.Index = I; - CanTTPTInfo.ParameterPack = PP; - } - - const CanonicalTTPTInfo& getCanTTPTInfo() const { - QualType Can = getCanonicalTypeInternal(); - return Can->castAs()->CanTTPTInfo; + (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)), + TTPDecl(TTPDecl) { + assert(!TTPDecl == Canon.isNull()); + TemplateTypeParmTypeBits.Depth = D; + TemplateTypeParmTypeBits.Index = I; + TemplateTypeParmTypeBits.ParameterPack = PP; } public: - unsigned getDepth() const { return getCanTTPTInfo().Depth; } - unsigned getIndex() const { return getCanTTPTInfo().Index; } - bool isParameterPack() const { return getCanTTPTInfo().ParameterPack; } - - TemplateTypeParmDecl *getDecl() const { - return isCanonicalUnqualified() ? 
nullptr : TTPDecl; + unsigned getDepth() const { return TemplateTypeParmTypeBits.Depth; } + unsigned getIndex() const { return TemplateTypeParmTypeBits.Index; } + bool isParameterPack() const { + return TemplateTypeParmTypeBits.ParameterPack; } + TemplateTypeParmDecl *getDecl() const { return TTPDecl; } + IdentifierInfo *getIdentifier() const; bool isSugared() const { return false; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5cdf36660b2a66d..554dbaff2ce0d87 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11693,6 +11693,8 @@ def err_module_not_defined : Error< def err_module_redeclaration : Error< "translation unit contains multiple module declarations">; def note_prev_module_declaration : Note<"previous module declaration is here">; +def err_module_declaration_missing : Error< + "missing 'export module' declaration in module interface unit">; def err_module_declaration_missing_after_global_module_introducer : Error< "missing 'module' declaration at end of global module fragment " "introduced here">; diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td index 51d0abbbec252ab..9854972cbfe7e40 100644 --- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td @@ -29,20 +29,20 @@ def note_pch_rebuild_required : Note<"please rebuild precompiled header '%0'">; def note_module_cache_path : Note< "after modifying system headers, please delete the module cache at '%0'">; -def err_pch_targetopt_mismatch : Error< - "PCH file was compiled for the %0 '%1' but the current translation " - "unit is being compiled for target '%2'">; -def err_pch_targetopt_feature_mismatch : Error< - "%select{AST file was|current translation unit is}0 compiled with the target " - "feature '%1' but the 
%select{current translation unit is|AST file was}0 " +def err_ast_file_targetopt_mismatch : Error< + "AST file '%0' was compiled for the %1 '%2' but the current translation " + "unit is being compiled for target '%3'">; +def err_ast_file_targetopt_feature_mismatch : Error< + "%select{AST file '%1' was|current translation unit is}0 compiled with the target " + "feature '%2' but the %select{current translation unit is|AST file '%1' was}0 " "not">; -def err_pch_langopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in " - "PCH file but is currently %select{disabled|enabled}2">; -def err_pch_langopt_value_mismatch : Error< - "%0 differs in PCH file vs. current file">; -def err_pch_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " - "the PCH file">; -def err_pch_modulecache_mismatch : Error<"PCH was compiled with module cache " +def err_ast_file_langopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in " + "AST file '%3' but is currently %select{disabled|enabled}2">; +def err_ast_file_langopt_value_mismatch : Error< + "%0 differs in AST file '%1' vs. 
current file">; +def err_ast_file_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " + "the AST file '%1'">; +def err_ast_file_modulecache_mismatch : Error<"AST file '%2' was compiled with module cache " "path '%0', but the path is currently '%1'">; def warn_pch_vfsoverlay_mismatch : Warning< "PCH was compiled with different VFS overlay files than are currently in use">, @@ -99,19 +99,19 @@ def err_module_different_modmap : Error< "module '%0' %select{uses|does not use}1 additional module map '%2'" "%select{| not}1 used when the module was built">; -def err_pch_macro_def_undef : Error< - "macro '%0' was %select{defined|undef'd}1 in the precompiled header but " +def err_ast_file_macro_def_undef : Error< + "macro '%0' was %select{defined|undef'd}1 in the AST file '%2' but " "%select{undef'd|defined}1 on the command line">; -def err_pch_macro_def_conflict : Error< - "definition of macro '%0' differs between the precompiled header ('%1') " +def err_ast_file_macro_def_conflict : Error< + "definition of macro '%0' differs between the AST file '%3' ('%1') " "and the command line ('%2')">; -def err_pch_undef : Error< - "%select{command line contains|precompiled header was built with}0 " - "'-undef' but %select{precompiled header was not built with it|" +def err_ast_file_undef : Error< + "%select{command line contains|AST file '%1' was built with}0 " + "'-undef' but %select{AST file '%1' was not built with it|" "it is not present on the command line}0">; -def err_pch_pp_detailed_record : Error< - "%select{command line contains|precompiled header was built with}0 " - "'-detailed-preprocessing-record' but %select{precompiled header was not " +def err_ast_file_pp_detailed_record : Error< + "%select{command line contains|AST file '%1' was built with}0 " + "'-detailed-preprocessing-record' but %select{AST file '%1' was not " "built with it|it is not present on the command line}0">; def err_module_odr_violation_missing_decl : Error< diff --git 
a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 6945f8b01e91cde..54e689a7a42213d 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -91,6 +91,7 @@ LANGOPT(C2y , 1, 0, "C2y") LANGOPT(MSVCCompat , 1, 0, "Microsoft Visual C++ full compatibility mode") LANGOPT(Kernel , 1, 0, "Kernel mode") LANGOPT(MicrosoftExt , 1, 0, "Microsoft C++ extensions") +LANGOPT(ZOSExt , 1, 0, "z/OS extensions") LANGOPT(AsmBlocks , 1, 0, "Microsoft inline asm blocks") LANGOPT(Borland , 1, 0, "Borland extensions") LANGOPT(CPlusPlus , 1, 0, "C++") diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 2cea64e2bd590b7..421dbb413fed939 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -292,6 +292,7 @@ PUNCTUATOR(caretcaret, "^^") // CHAR8SUPPORT - This is a keyword if 'char8_t' is a built-in type // KEYFIXEDPOINT - This is a keyword according to the N1169 fixed point // extension. 
+// KEYZOS - This is a keyword in C/C++ on z/OS // KEYWORD(auto , KEYALL) KEYWORD(break , KEYALL) @@ -725,7 +726,7 @@ KEYWORD(__funcref , KEYALL) // Microsoft extensions which should be disabled in strict conformance mode KEYWORD(__ptr64 , KEYMS) -KEYWORD(__ptr32 , KEYMS) +KEYWORD(__ptr32 , KEYMS | KEYZOS) KEYWORD(__sptr , KEYMS) KEYWORD(__uptr , KEYMS) KEYWORD(__w64 , KEYMS) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 51ec29f1dc32120..e196c3dc5cb3be8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3066,6 +3066,10 @@ dll version.}]>; def fms_omit_default_lib : Joined<["-"], "fms-omit-default-lib">, Group, Flags<[]>, Visibility<[ClangOption, CLOption]>; +def fzos_extensions : Flag<["-"], "fzos-extensions">, Group, Visibility<[ClangOption, CC1Option]>, + HelpText<"Accept some non-standard constructs supported by the z/OS compiler">; +def fno_zos_extensions : Flag<["-"], "fno-zos-extensions">, Group, Visibility<[ClangOption, CC1Option]>, + HelpText<"Do not accept non-standard constructs supported by the z/OS compiler">; defm delayed_template_parsing : BoolFOption<"delayed-template-parsing", LangOpts<"DelayedTemplateParsing">, DefaultFalse, PosFlag DiagOpts, - bool Complain) { + StringRef ModuleFilename, bool Complain) { return false; } @@ -172,6 +173,7 @@ class ASTReaderListener { /// \returns true to indicate the header search options are invalid, or false /// otherwise. virtual bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { return false; @@ -200,6 +202,7 @@ class ASTReaderListener { /// \returns true to indicate the preprocessor options are invalid, or false /// otherwise. 
virtual bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { return false; @@ -262,20 +265,24 @@ class ChainedASTReaderListener : public ASTReaderListener { bool ReadFullVersionInformation(StringRef FullVersion) override; void ReadModuleName(StringRef ModuleName) override; void ReadModuleMapFile(StringRef ModuleMapPath) override; - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - bool Complain) override; + StringRef ModuleFilename, bool Complain) override; bool ReadFileSystemOptions(const FileSystemOptions &FSOpts, bool Complain) override; bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override; @@ -299,16 +306,20 @@ class PCHValidator : public ASTReaderListener { PCHValidator(Preprocessor &PP, ASTReader &Reader) : PP(PP), Reader(Reader) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const 
TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - bool Complain) override; + StringRef ModuleFilename, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override; void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override; @@ -325,7 +336,8 @@ class SimpleASTReaderListener : public ASTReaderListener { SimpleASTReaderListener(Preprocessor &PP) : PP(PP) {} bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; }; @@ -1366,10 +1378,12 @@ class ASTReader SmallVectorImpl &Loaded, const ModuleFile *ImportedBy, unsigned ClientLoadCapabilities); - static ASTReadResult ReadOptionsBlock( - llvm::BitstreamCursor &Stream, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, - std::string &SuggestedPredefines); + static ASTReadResult + ReadOptionsBlock(llvm::BitstreamCursor &Stream, StringRef Filename, + unsigned ClientLoadCapabilities, + bool AllowCompatibleConfigurationMismatch, + ASTReaderListener &Listener, + std::string &SuggestedPredefines); /// Read the unhashed control block. 
/// @@ -1378,12 +1392,11 @@ class ASTReader ASTReadResult readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, unsigned ClientLoadCapabilities); - static ASTReadResult - readUnhashedControlBlockImpl(ModuleFile *F, llvm::StringRef StreamData, - unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, - ASTReaderListener *Listener, - bool ValidateDiagnosticOptions); + static ASTReadResult readUnhashedControlBlockImpl( + ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, + unsigned ClientLoadCapabilities, + bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, + bool ValidateDiagnosticOptions); llvm::Error ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities); llvm::Error ReadExtensionBlock(ModuleFile &F); @@ -1396,21 +1409,26 @@ class ASTReader unsigned ClientLoadCapabilities); llvm::Error ReadSubmoduleBlock(ModuleFile &F, unsigned ClientLoadCapabilities); - static bool ParseLanguageOptions(const RecordData &Record, bool Complain, + static bool ParseLanguageOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseTargetOptions(const RecordData &Record, bool Complain, + static bool ParseTargetOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseDiagnosticOptions(const RecordData &Record, bool Complain, + static bool ParseDiagnosticOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseFileSystemOptions(const RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParseHeaderSearchOptions(const RecordData &Record, bool Complain, + static bool ParseHeaderSearchOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseHeaderSearchPaths(const 
RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParsePreprocessorOptions(const RecordData &Record, bool Complain, + static bool ParsePreprocessorOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8ccf4a2e7732219..0dbbe0043f70317 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5300,15 +5300,15 @@ QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index, if (TTPDecl) { QualType Canon = getTemplateTypeParmType(Depth, Index, ParameterPack); TypeParm = new (*this, alignof(TemplateTypeParmType)) - TemplateTypeParmType(TTPDecl, Canon); + TemplateTypeParmType(Depth, Index, ParameterPack, TTPDecl, Canon); TemplateTypeParmType *TypeCheck = TemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos); assert(!TypeCheck && "Template type parameter canonical type broken"); (void)TypeCheck; } else - TypeParm = new (*this, alignof(TemplateTypeParmType)) - TemplateTypeParmType(Depth, Index, ParameterPack); + TypeParm = new (*this, alignof(TemplateTypeParmType)) TemplateTypeParmType( + Depth, Index, ParameterPack, /*TTPDecl=*/nullptr, /*Canon=*/QualType()); Types.push_back(TypeParm); TemplateTypeParmTypes.InsertNode(TypeParm, InsertPos); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index d832ce4190ff1a5..e125143bc1b2703 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -583,12 +583,6 @@ static bool isSingleLineLanguageLinkage(const Decl &D) { return false; } -static bool isDeclaredInModuleInterfaceOrPartition(const NamedDecl *D) { - if (auto *M = D->getOwningModule()) - return M->isInterfaceOrPartition(); - return false; -} - static LinkageInfo getExternalLinkageFor(const NamedDecl *D) { return LinkageInfo::external(); } @@ -642,7 +636,13 @@ LinkageComputer::getLVForNamespaceScopeDecl(const NamedDecl *D, 
// (There is no equivalent in C99.) if (Context.getLangOpts().CPlusPlus && Var->getType().isConstQualified() && !Var->getType().isVolatileQualified() && !Var->isInline() && - !isDeclaredInModuleInterfaceOrPartition(Var) && + ![Var]() { + // Check if it is module purview except private module fragment + // and implementation unit. + if (auto *M = Var->getOwningModule()) + return M->isInterfaceOrPartition() || M->isImplicitGlobalModule(); + return false; + }() && !isa(Var) && !Var->getDescribedVarTemplate()) { const VarDecl *PrevVar = Var->getPreviousDecl(); diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 423508b3adb996f..832fc028ad6696c 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2421,7 +2421,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); - if (!Ptr.isZero()) { + if (!Ptr.isZero() && !Offset.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; } @@ -2437,7 +2437,7 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (!Ptr.isZero()) { + if (!Ptr.isZero() && !Offset.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 07c6a404e08b50f..976670d1efa5618 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2795,7 +2795,11 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp ASString = "ptr32_sptr"; break; case LangAS::ptr32_uptr: - ASString = "ptr32_uptr"; + // For z/OS, there are no special mangling rules applied to the ptr32 + // qualifier. Ex: void foo(int * __ptr32 p) -> _Z3fooPi. The mangling for + // "p" is treated the same as a regular integer pointer.
+ if (!getASTContext().getTargetInfo().getTriple().isOSzOS()) + ASString = "ptr32_uptr"; break; case LangAS::ptr64: ASString = "ptr64"; diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp index 9cf081e9e26c139..c9c9d927a5902ef 100644 --- a/clang/lib/Basic/IdentifierTable.cpp +++ b/clang/lib/Basic/IdentifierTable.cpp @@ -107,12 +107,14 @@ enum TokenKey : unsigned { KEYMSCOMPAT = 0x400000, KEYSYCL = 0x800000, KEYCUDA = 0x1000000, - KEYHLSL = 0x2000000, - KEYFIXEDPOINT = 0x4000000, + KEYZOS = 0x2000000, + KEYNOZOS = 0x4000000, + KEYHLSL = 0x8000000, + KEYFIXEDPOINT = 0x10000000, KEYMAX = KEYFIXEDPOINT, // The maximum key KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20, - KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & - ~KEYNOOPENCL // KEYNOMS18 and KEYNOOPENCL are used to exclude. + KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL & + ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded. }; /// How a keyword is treated in the selected standard. This enum is ordered @@ -199,6 +201,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts, return LangOpts.isSYCL() ? KS_Enabled : KS_Unknown; case KEYCUDA: return LangOpts.CUDA ? KS_Enabled : KS_Unknown; + case KEYZOS: + return LangOpts.ZOSExt ? KS_Enabled : KS_Unknown; case KEYHLSL: return LangOpts.HLSL ? KS_Enabled : KS_Unknown; case KEYNOCXX: @@ -206,9 +210,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts, // reasons as well. return LangOpts.CPlusPlus ? KS_Unknown : KS_Enabled; case KEYNOOPENCL: - // The disable behavior for this is handled in getKeywordStatus. - return KS_Unknown; case KEYNOMS18: + case KEYNOZOS: // The disable behavior for this is handled in getKeywordStatus. 
return KS_Unknown; case KEYFIXEDPOINT: @@ -230,7 +233,8 @@ static KeywordStatus getKeywordStatus(const LangOptions &LangOpts, if (LangOpts.MSVCCompat && (Flags & KEYNOMS18) && !LangOpts.isCompatibleWithMSVC(LangOptions::MSVC2015)) return KS_Disabled; - + if (LangOpts.ZOSExt && (Flags & KEYNOZOS)) + return KS_Disabled; KeywordStatus CurStatus = KS_Unknown; while (Flags != 0) { diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index b5f9adfdd515b0e..04dc436eb1b9cdf 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -469,21 +469,36 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // set of options. static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, const std::vector &FeaturesVec) { - // Cannot allow soft-float with Altivec. - if (llvm::is_contained(FeaturesVec, "-hard-float") && - llvm::is_contained(FeaturesVec, "+altivec")) { - Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float" - << "-maltivec"; + auto FindVSXSubfeature = [&](StringRef Feature, StringRef SubOption, + StringRef Option) { + if (llvm::is_contained(FeaturesVec, Feature)) { + Diags.Report(diag::err_opt_not_valid_with_opt) << SubOption << Option; + return true; + } return false; - } + }; - // Cannot allow soft-float with VSX. - if (llvm::is_contained(FeaturesVec, "-hard-float") && - llvm::is_contained(FeaturesVec, "+vsx")) { - Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float" - << "-mvsx"; - return false; + // Cannot allow soft-float with VSX, Altivec, or any + // VSX subfeatures. 
+ bool Found = false; + if (llvm::is_contained(FeaturesVec, "-hard-float")) { + Found |= FindVSXSubfeature("+vsx", "-mvsx", "-msoft-float"); + Found |= FindVSXSubfeature("+altivec", "-maltivec", "-msoft-float"); + Found |= + FindVSXSubfeature("+power8-vector", "-mpower8-vector", "-msoft-float"); + Found |= FindVSXSubfeature("+direct-move", "-mdirect-move", "-msoft-float"); + Found |= FindVSXSubfeature("+float128", "-mfloat128", "-msoft-float"); + Found |= + FindVSXSubfeature("+power9-vector", "-mpower9-vector", "-msoft-float"); + Found |= FindVSXSubfeature("+paired-vector-memops", + "-mpaired-vector-memops", "-msoft-float"); + Found |= FindVSXSubfeature("+mma", "-mmma", "-msoft-float"); + Found |= FindVSXSubfeature("+crypto", "-mcrypto", "-msoft-float"); + Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector", + "-msoft-float"); } + if (Found) + return false; // Cannot allow VSX with no Altivec. if (llvm::is_contained(FeaturesVec, "+vsx") && @@ -497,21 +512,14 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, if (!llvm::is_contained(FeaturesVec, "-vsx")) return true; - auto FindVSXSubfeature = [&](StringRef Feature, StringRef Option) { - if (llvm::is_contained(FeaturesVec, Feature)) { - Diags.Report(diag::err_opt_not_valid_with_opt) << Option << "-mno-vsx"; - return true; - } - return false; - }; - - bool Found = FindVSXSubfeature("+power8-vector", "-mpower8-vector"); - Found |= FindVSXSubfeature("+direct-move", "-mdirect-move"); - Found |= FindVSXSubfeature("+float128", "-mfloat128"); - Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector"); - Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops"); - Found |= FindVSXSubfeature("+mma", "-mmma"); - Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector"); + Found = FindVSXSubfeature("+power8-vector", "-mpower8-vector", "-mno-vsx"); + Found |= FindVSXSubfeature("+direct-move", "-mdirect-move", "-mno-vsx"); + Found |= FindVSXSubfeature("+float128", 
"-mfloat128", "-mno-vsx"); + Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector", "-mno-vsx"); + Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops", + "-mno-vsx"); + Found |= FindVSXSubfeature("+mma", "-mmma", "-mno-vsx"); + Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector", "-mno-vsx"); // Return false if any vsx subfeatures was found. return !Found; @@ -693,7 +701,6 @@ bool PPCTargetInfo::initFeatureMap( Diags.Report(diag::err_opt_not_valid_with_opt) << "-mprivileged" << CPU; return false; } - return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } @@ -783,13 +790,16 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, } else { if (Name == "spe") Features["efpu2"] = false; - // If we're disabling altivec or vsx go ahead and disable all of the vsx - // features. - if ((Name == "altivec") || (Name == "vsx")) + // If we're disabling altivec, hard-float, or vsx go ahead and disable all + // of the vsx features. 
+ if ((Name == "altivec") || (Name == "vsx") || (Name == "hard-float")) { + if (Name != "vsx") + Features["altivec"] = Features["crypto"] = false; Features["vsx"] = Features["direct-move"] = Features["power8-vector"] = Features["float128"] = Features["power9-vector"] = Features["paired-vector-memops"] = Features["mma"] = Features["power10-vector"] = false; + } if (Name == "power8-vector") Features["power9-vector"] = Features["paired-vector-memops"] = Features["mma"] = Features["power10-vector"] = false; diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 3bc6f2c1d308328..7390f25d6efb1de 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -21,6 +21,30 @@ namespace clang { namespace targets { +static const unsigned ZOSAddressMap[] = { + 0, // Default + 0, // opencl_global + 0, // opencl_local + 0, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 0, // opencl_global_device + 0, // opencl_global_host + 0, // cuda_device + 0, // cuda_constant + 0, // cuda_shared + 0, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 0, // sycl_local + 0, // sycl_private + 0, // ptr32_sptr + 1, // ptr32_uptr + 0, // ptr64 + 0, // hlsl_groupshared + 0 // wasm_funcref +}; + class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { static const char *const GCCRegNames[]; @@ -30,6 +54,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { bool HasVector; bool SoftFloat; bool UnalignedSymbols; + enum AddrSpace { ptr32 = 1 }; public: SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &) @@ -49,6 +74,9 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { MinGlobalAlign = 16; HasUnalignedAccess = true; if (Triple.isOSzOS()) { + if (Triple.isArch64Bit()) { + AddrSpaceMap = &ZOSAddressMap; + } TLSSupported = false; // All vector types are default aligned on an 8-byte boundary, even if the // vector facility is not 
available. That is different from Linux. @@ -56,7 +84,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { // Compared to Linux/ELF, the data layout differs only in some details: // - name mangling is GOFF. // - 32 bit pointers, either as default or special address space - resetDataLayout("E-m:l-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-" + resetDataLayout("E-m:l-p1:32:32-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-" "a:8:16-n32:64"); } else { TLSSupported = true; @@ -224,6 +252,16 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { std::pair hardwareInterferenceSizes() const override { return std::make_pair(256, 256); } + uint64_t getPointerWidthV(LangAS AddrSpace) const override { + return (getTriple().isOSzOS() && getTriple().isArch64Bit() && + getTargetAddressSpace(AddrSpace) == ptr32) + ? 32 + : PointerWidth; + } + + uint64_t getPointerAlignV(LangAS AddrSpace) const override { + return getPointerWidthV(AddrSpace); + } }; } // namespace targets } // namespace clang diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2054c8fe928e2e4..c698d38b80e578f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7514,6 +7514,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, (C.isForDiagnostics() && !HaveModules)) CmdArgs.push_back("-frewrite-includes"); + if (Args.hasFlag(options::OPT_fzos_extensions, + options::OPT_fno_zos_extensions, false)) + CmdArgs.push_back("-fzos-extensions"); + else if (Args.hasArg(options::OPT_fno_zos_extensions)) + CmdArgs.push_back("-fno-zos-extensions"); + // Only allow -traditional or -traditional-cpp outside in preprocessing modes. 
if (Arg *A = Args.getLastArg(options::OPT_traditional, options::OPT_traditional_cpp)) { diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 67d4c07d1ce39a0..84e273a3949ef02 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -536,7 +536,8 @@ class ASTInfoCollector : public ASTReaderListener { LangOpt(LangOpt), TargetOpts(TargetOpts), Target(Target), Counter(Counter) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { if (InitializedLanguage) return false; @@ -559,6 +560,7 @@ class ASTInfoCollector : public ASTReaderListener { } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { // llvm::SaveAndRestore doesn't support bit field. @@ -597,13 +599,15 @@ class ASTInfoCollector : public ASTReaderListener { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { this->PPOpts = PPOpts; return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { // If we've already initialized the target, don't do it again. 
if (Target) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 225bd6416ce5fc3..4dbb13fb723eaf6 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3650,6 +3650,11 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_ftrigraphs); } + if (T.isOSzOS() && !Opts.ZOSExt) + GenerateArg(Consumer, OPT_fno_zos_extensions); + else if (Opts.ZOSExt) + GenerateArg(Consumer, OPT_fzos_extensions); + if (Opts.Blocks && !(Opts.OpenCL && Opts.OpenCLVersion == 200)) GenerateArg(Consumer, OPT_fblocks); @@ -4051,6 +4056,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Opts.Trigraphs = Args.hasFlag(OPT_ftrigraphs, OPT_fno_trigraphs, Opts.Trigraphs); + Opts.ZOSExt = + Args.hasFlag(OPT_fzos_extensions, OPT_fno_zos_extensions, T.isOSzOS()); + Opts.Blocks = Args.hasArg(OPT_fblocks) || (Opts.OpenCL && Opts.OpenCLVersion == 200); diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index e70210d55fe28d5..9f5d09e33ce244f 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -622,7 +622,8 @@ namespace { Out.indent(2) << "Module map file: " << ModuleMapPath << "\n"; } - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Language options:\n"; #define LANGOPT(Name, Bits, Default, Description) \ @@ -645,7 +646,8 @@ namespace { return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Target options:\n"; Out.indent(4) << " Triple: " << TargetOpts.Triple << "\n"; @@ 
-665,6 +667,7 @@ namespace { } bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, + StringRef ModuleFilename, bool Complain) override { Out.indent(2) << "Diagnostic options:\n"; #define DIAGOPT(Name, Bits, Default) DUMP_BOOLEAN(DiagOpts->Name, #Name); @@ -684,6 +687,7 @@ namespace { } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { Out.indent(2) << "Header search options:\n"; @@ -717,7 +721,8 @@ namespace { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { Out.indent(2) << "Preprocessor options:\n"; DUMP_BOOLEAN(PPOpts.UsePredefines, diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 5ebe71e496a2e83..04c2f1d380bc48b 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -629,11 +629,6 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, Sema::ModuleImportState &ImportState) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); - // Skip over the EOF token, flagging end of previous input for incremental - // processing - if (PP.isIncrementalProcessingEnabled() && Tok.is(tok::eof)) - ConsumeToken(); - Result = nullptr; switch (Tok.getKind()) { case tok::annot_pragma_unused: diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 19d8692ee64849b..633b8220ffbf11d 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1272,6 +1272,18 @@ void Sema::ActOnEndOfTranslationUnit() { Module::ExplicitGlobalModuleFragment) { Diag(ModuleScopes.back().BeginLoc, diag::err_module_declaration_missing_after_global_module_introducer); + } else if (getLangOpts().getCompilingModule() == + LangOptions::CMK_ModuleInterface && + // We can't use ModuleScopes here since ModuleScopes is always + // empty if we're compiling the BMI. 
+ !getASTContext().getCurrentNamedModule()) { + // If we are building a module interface unit, we should have seen the + // module declaration. + // + // FIXME: Make a better guess as to where to put the module declaration. + Diag(getSourceManager().getLocForStartOfFile( + getSourceManager().getMainFileID()), + diag::err_module_declaration_missing); } // Now we can decide whether the modules we're building need an initializer. diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index d4c9d044985e34d..d1e62fb5cee6233 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -414,8 +414,7 @@ DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID, E->Profile(ID, S.Context, /*Canonical=*/true); for (const auto &List : MLTAL) for (const auto &TemplateArg : List.Args) - S.Context.getCanonicalTemplateArgument(TemplateArg) - .Profile(ID, S.Context); + TemplateArg.Profile(ID, S.Context); // Note that we have to do this with our own collection, because there are // times where a constraint-expression check can cause us to need to evaluate @@ -643,8 +642,8 @@ bool Sema::CheckConstraintSatisfaction( // here. 
llvm::SmallVector FlattenedArgs; for (auto List : TemplateArgsLists) - for (const TemplateArgument &Arg : List.Args) - FlattenedArgs.emplace_back(Context.getCanonicalTemplateArgument(Arg)); + FlattenedArgs.insert(FlattenedArgs.end(), List.Args.begin(), + List.Args.end()); llvm::FoldingSetNodeID ID; ConstraintSatisfaction::Profile(ID, Context, Template, FlattenedArgs); @@ -828,8 +827,6 @@ Sema::SetupConstraintCheckingTemplateArgumentsAndScope( /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); - if (TemplateArgs) - MLTAL.replaceInnermostTemplateArguments(FD, *TemplateArgs, /*Final=*/true); if (SetupConstraintScope(FD, TemplateArgs, MLTAL, Scope)) return std::nullopt; @@ -1483,7 +1480,7 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, const ConceptSpecializationExpr *CSE) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(), - /*Final=*/true, CSE->getTemplateArguments(), + /*Final=*/false, CSE->getTemplateArguments(), /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 1b56b4cabd133ef..124435330ca104f 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -9330,13 +9330,14 @@ Sema::BuildExprRequirement( // be satisfied. 
TemplateParameterList *TPL = ReturnTypeRequirement.getTypeConstraintTemplateParameterList(); - QualType MatchedType = Context.getReferenceQualifiedType(E); + QualType MatchedType = + Context.getReferenceQualifiedType(E).getCanonicalType(); llvm::SmallVector Args; Args.push_back(TemplateArgument(MatchedType)); auto *Param = cast(TPL->getParam(0)); - MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/true); + MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/false); MLTAL.addOuterRetainedLevels(TPL->getDepth()); const TypeConstraint *TC = Param->getTypeConstraint(); assert(TC && "Type Constraint cannot be null here"); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index abd4401e029817d..2666e60c0dd67c4 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4340,7 +4340,7 @@ static OverloadingResult ResolveConstructorOverload( /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// type as the entity? +/// aggregate type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4370,6 +4370,14 @@ static void TryConstructorInitialization(Sema &S, Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); + bool CopyElisionPossible = false; + auto ElideConstructor = [&] { + // Convert qualifications if necessary. 
+ Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + }; + // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4382,11 +4390,33 @@ static void TryConstructorInitialization(Sema &S, if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - // Convert qualifications if necessary. - Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - return; + if (ILE && !DestType->isAggregateType()) { + // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision + // Make this an elision if this won't call an initializer-list + // constructor. (Always on an aggregate type or check constructors first.) + + // This effectively makes our resolution as follows. The parts in angle + // brackets are additions. + // C++17 [over.match.list]p(1.2): + // - If no viable initializer-list constructor is found , [...] + // C++17 [dcl.init.list]p(3.6): + // - Otherwise, if T is a class type, constructors are considered. The + // applicable constructors are enumerated and the best one is chosen + // through overload resolution. + // if a narrowing conversion [...] 
+ assert(!IsInitListCopy && + "IsInitListCopy only possible with aggregate types"); + CopyElisionPossible = true; + } else { + ElideConstructor(); + return; + } } const RecordType *DestRecordType = DestType->getAs(); @@ -4431,6 +4461,12 @@ static void TryConstructorInitialization(Sema &S, S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); + + if (CopyElisionPossible && Result == OR_No_Viable_Function) { + // No initializer list candidate + ElideConstructor(); + return; + } } // C++11 [over.match.list]p1: @@ -4712,9 +4748,9 @@ static void TryListInitialization(Sema &S, return; } - // C++11 [dcl.init.list]p3, per DR1467: - // - If T is a class type and the initializer list has a single element of - // type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467 and DR2137: + // - If T is an aggregate class and the initializer list has a single element + // of type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4725,7 +4761,7 @@ static void TryListInitialization(Sema &S, // - Otherwise, if T is an aggregate, [...] (continue below). 
if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType()) { + if (DestType->isRecordType() && DestType->isAggregateType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index fd88b6a74297dc7..0f196ddf812fdfc 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1619,19 +1619,36 @@ TryUserDefinedConversion(Sema &S, Expr *From, QualType ToType, // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromCanon - = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); + QualType FromType; + SourceLocation FromLoc; + // C++11 [over.ics.list]p6, per DR2137: + // C++17 [over.ics.list]p6: + // If C is not an initializer-list constructor and the initializer list + // has a single element of type cv U, where U is X or a class derived + // from X, the implicit conversion sequence has Exact Match rank if U is + // X, or Conversion rank if U is derived from X. 
+ if (const auto *InitList = dyn_cast(From); + InitList && InitList->getNumInits() == 1 && + !S.isInitListConstructor(Constructor)) { + const Expr *SingleInit = InitList->getInit(0); + FromType = SingleInit->getType(); + FromLoc = SingleInit->getBeginLoc(); + } else { + FromType = From->getType(); + FromLoc = From->getBeginLoc(); + } + QualType FromCanon = + S.Context.getCanonicalType(FromType.getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); - if (Constructor->isCopyConstructor() && - (FromCanon == ToCanon || - S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { + if ((FromCanon == ToCanon || + S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(From->getType()); + ICS.Standard.setFromType(FromType); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5335,18 +5352,18 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, IsDesignatedInit) return Result; - // Per DR1467: - // If the parameter type is a class X and the initializer list has a single - // element of type cv U, where U is X or a class derived from X, the - // implicit conversion sequence is the one required to convert the element - // to the parameter type. + // Per DR1467 and DR2137: + // If the parameter type is an aggregate class X and the initializer list + // has a single element of type cv U, where U is X or a class derived from + // X, the implicit conversion sequence is the one required to convert the + // element to the parameter type. // // Otherwise, if the parameter type is a character array [... 
] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType()) { + if (ToType->isRecordType() && ToType->isAggregateType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index cd3ee31fcca610b..29e7978ba5b1f86 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4358,13 +4358,13 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS, auto *CSD = ImplicitConceptSpecializationDecl::Create( Context, NamedConcept->getDeclContext(), NamedConcept->getLocation(), - SugaredConverted); + CanonicalConverted); ConstraintSatisfaction Satisfaction; bool AreArgsDependent = TemplateSpecializationType::anyDependentTemplateArguments( - *TemplateArgs, SugaredConverted); - MultiLevelTemplateArgumentList MLTAL(NamedConcept, SugaredConverted, - /*Final=*/true); + *TemplateArgs, CanonicalConverted); + MultiLevelTemplateArgumentList MLTAL(NamedConcept, CanonicalConverted, + /*Final=*/false); LocalInstantiationScope Scope(*this); EnterExpressionEvaluationContext EECtx{ @@ -5583,7 +5583,7 @@ bool Sema::CheckTemplateArgumentList( CXXThisScopeRAII(*this, RD, ThisQuals, RD != nullptr); MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs( - Template, NewContext, /*Final=*/true, SugaredConverted, + Template, NewContext, /*Final=*/false, CanonicalConverted, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConceptInstantiation=*/true); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 978f1a9dc1a933d..e9705ec43d86cc6 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ 
b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3078,7 +3078,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, // If we don't need to replace the deduced template arguments, // we can add them immediately as the inner-most argument list. if (!DeducedArgsNeedReplacement(Template)) - Innermost = SugaredDeducedArgs; + Innermost = CanonicalDeducedArgs; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( Template, Template->getDeclContext(), /*Final=*/false, Innermost, @@ -3090,7 +3090,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, // not class-scope explicit specialization, so replace with Deduced Args // instead of adding to inner-most. if (!Innermost) - MLTAL.replaceInnermostTemplateArguments(Template, SugaredDeducedArgs); + MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs); if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL, Info.getLocation(), @@ -3913,13 +3913,13 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( (CanonicalBuilder.size() == FunctionTemplate->getTemplateParameters()->size())) { if (CheckInstantiatedFunctionTemplateConstraints( - Info.getLocation(), Specialization, SugaredBuilder, + Info.getLocation(), Specialization, CanonicalBuilder, Info.AssociatedConstraintsSatisfaction)) return TemplateDeductionResult::MiscellaneousDeductionFailure; if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { - Info.reset(TemplateArgumentList::CreateCopy(Context, SugaredBuilder), - Info.takeCanonical()); + Info.reset(Info.takeSugared(), + TemplateArgumentList::CreateCopy(Context, CanonicalBuilder)); return TemplateDeductionResult::ConstraintsNotSatisfied; } } @@ -4993,8 +4993,8 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, /*PartialTemplateArgs=*/false, SugaredConverted, CanonicalConverted)) return true; - MultiLevelTemplateArgumentList MLTAL(Concept, SugaredConverted, - /*Final=*/true); + 
MultiLevelTemplateArgumentList MLTAL(Concept, CanonicalConverted, + /*Final=*/false); // Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so // that the template arguments of the constraint can be preserved. For // example: @@ -5008,7 +5008,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, S, Sema::ExpressionEvaluationContext::Unevaluated, ImplicitConceptSpecializationDecl::Create( S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(), - SugaredConverted)); + CanonicalConverted)); if (S.CheckConstraintSatisfaction(Concept, {Concept->getConstraintExpr()}, MLTAL, TypeLoc.getLocalSourceRange(), Satisfaction)) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 8f8e9183330ec96..08020f9f889270a 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7065,7 +7065,7 @@ static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State, else if (Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; } else if (PtrWidth == 64 && Attrs[attr::Ptr32]) { - if (Attrs[attr::UPtr]) + if (S.Context.getTargetInfo().getTriple().isOSzOS() || Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; else ASIdx = LangAS::ptr32_sptr; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index e7ccf8a60feb5df..ad8d6c336f27809 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -172,29 +172,29 @@ void ChainedASTReaderListener::ReadModuleMapFile(StringRef ModuleMapPath) { Second->ReadModuleMapFile(ModuleMapPath); } -bool -ChainedASTReaderListener::ReadLanguageOptions(const LangOptions &LangOpts, - bool Complain, - bool AllowCompatibleDifferences) { - return First->ReadLanguageOptions(LangOpts, Complain, +bool ChainedASTReaderListener::ReadLanguageOptions( + const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { + return 
First->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences) || - Second->ReadLanguageOptions(LangOpts, Complain, + Second->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } bool ChainedASTReaderListener::ReadTargetOptions( - const TargetOptions &TargetOpts, bool Complain, + const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { - return First->ReadTargetOptions(TargetOpts, Complain, + return First->ReadTargetOptions(TargetOpts, ModuleFilename, Complain, AllowCompatibleDifferences) || - Second->ReadTargetOptions(TargetOpts, Complain, + Second->ReadTargetOptions(TargetOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } bool ChainedASTReaderListener::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, bool Complain) { - return First->ReadDiagnosticOptions(DiagOpts, Complain) || - Second->ReadDiagnosticOptions(DiagOpts, Complain); + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, + bool Complain) { + return First->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain) || + Second->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain); } bool @@ -205,21 +205,21 @@ ChainedASTReaderListener::ReadFileSystemOptions(const FileSystemOptions &FSOpts, } bool ChainedASTReaderListener::ReadHeaderSearchOptions( - const HeaderSearchOptions &HSOpts, StringRef SpecificModuleCachePath, - bool Complain) { - return First->ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, - Complain) || - Second->ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, - Complain); + const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, + StringRef SpecificModuleCachePath, bool Complain) { + return First->ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain) || + Second->ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain); } bool ChainedASTReaderListener::ReadPreprocessorOptions( - 
const PreprocessorOptions &PPOpts, bool ReadMacros, bool Complain, - std::string &SuggestedPredefines) { - return First->ReadPreprocessorOptions(PPOpts, ReadMacros, Complain, - SuggestedPredefines) || - Second->ReadPreprocessorOptions(PPOpts, ReadMacros, Complain, - SuggestedPredefines); + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, + bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { + return First->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines) || + Second->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines); } void ChainedASTReaderListener::ReadCounter(const serialization::ModuleFile &M, @@ -282,35 +282,37 @@ ASTReaderListener::~ASTReaderListener() = default; /// \returns true if the languagae options mis-match, false otherwise. static bool checkLanguageOptions(const LangOptions &LangOpts, const LangOptions &ExistingLangOpts, + StringRef ModuleFilename, DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { -#define LANGOPT(Name, Bits, Default, Description) \ - if (ExistingLangOpts.Name != LangOpts.Name) { \ - if (Diags) { \ - if (Bits == 1) \ - Diags->Report(diag::err_pch_langopt_mismatch) \ - << Description << LangOpts.Name << ExistingLangOpts.Name; \ - else \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ - } \ - return true; \ - } - -#define VALUE_LANGOPT(Name, Bits, Default, Description) \ - if (ExistingLangOpts.Name != LangOpts.Name) { \ - if (Diags) \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ - return true; \ - } - -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ - if (ExistingLangOpts.get##Name() != LangOpts.get##Name()) { \ - if (Diags) \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ - return true; \ +#define LANGOPT(Name, Bits, Default, Description) \ + if (ExistingLangOpts.Name != LangOpts.Name) { \ + if (Diags) { 
\ + if (Bits == 1) \ + Diags->Report(diag::err_ast_file_langopt_mismatch) \ + << Description << LangOpts.Name << ExistingLangOpts.Name \ + << ModuleFilename; \ + else \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + } \ + return true; \ + } + +#define VALUE_LANGOPT(Name, Bits, Default, Description) \ + if (ExistingLangOpts.Name != LangOpts.Name) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + return true; \ + } + +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ + if (ExistingLangOpts.get##Name() != LangOpts.get##Name()) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + return true; \ } #define COMPATIBLE_LANGOPT(Name, Bits, Default, Description) \ @@ -332,22 +334,23 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, if (ExistingLangOpts.ModuleFeatures != LangOpts.ModuleFeatures) { if (Diags) - Diags->Report(diag::err_pch_langopt_value_mismatch) << "module features"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "module features" << ModuleFilename; return true; } if (ExistingLangOpts.ObjCRuntime != LangOpts.ObjCRuntime) { if (Diags) - Diags->Report(diag::err_pch_langopt_value_mismatch) - << "target Objective-C runtime"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "target Objective-C runtime" << ModuleFilename; return true; } if (ExistingLangOpts.CommentOpts.BlockCommandNames != LangOpts.CommentOpts.BlockCommandNames) { if (Diags) - Diags->Report(diag::err_pch_langopt_value_mismatch) - << "block command names"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "block command names" << ModuleFilename; return true; } @@ -369,8 +372,8 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, bool InExistingModule = ExistingSanitizers.has(SanitizerKind::ID); \ bool InImportedModule = 
ImportedSanitizers.has(SanitizerKind::ID); \ if (InExistingModule != InImportedModule) \ - Diags->Report(diag::err_pch_targetopt_feature_mismatch) \ - << InExistingModule << (Flag + NAME); \ + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) \ + << InExistingModule << ModuleFilename << (Flag + NAME); \ } #include "clang/Basic/Sanitizers.def" } @@ -389,14 +392,16 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, /// \returns true if the target options mis-match, false otherwise. static bool checkTargetOptions(const TargetOptions &TargetOpts, const TargetOptions &ExistingTargetOpts, + StringRef ModuleFilename, DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { -#define CHECK_TARGET_OPT(Field, Name) \ - if (TargetOpts.Field != ExistingTargetOpts.Field) { \ - if (Diags) \ - Diags->Report(diag::err_pch_targetopt_mismatch) \ - << Name << TargetOpts.Field << ExistingTargetOpts.Field; \ - return true; \ +#define CHECK_TARGET_OPT(Field, Name) \ + if (TargetOpts.Field != ExistingTargetOpts.Field) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_targetopt_mismatch) \ + << ModuleFilename << Name << TargetOpts.Field \ + << ExistingTargetOpts.Field; \ + return true; \ } // The triple and ABI must match exactly. 
@@ -439,31 +444,30 @@ static bool checkTargetOptions(const TargetOptions &TargetOpts, if (Diags) { for (StringRef Feature : UnmatchedReadFeatures) - Diags->Report(diag::err_pch_targetopt_feature_mismatch) - << /* is-existing-feature */ false << Feature; + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) + << /* is-existing-feature */ false << ModuleFilename << Feature; for (StringRef Feature : UnmatchedExistingFeatures) - Diags->Report(diag::err_pch_targetopt_feature_mismatch) - << /* is-existing-feature */ true << Feature; + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) + << /* is-existing-feature */ true << ModuleFilename << Feature; } return !UnmatchedReadFeatures.empty() || !UnmatchedExistingFeatures.empty(); } -bool -PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, - bool Complain, - bool AllowCompatibleDifferences) { +bool PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { const LangOptions &ExistingLangOpts = PP.getLangOpts(); - return checkLanguageOptions(LangOpts, ExistingLangOpts, + return checkLanguageOptions(LangOpts, ExistingLangOpts, ModuleFilename, Complain ? &Reader.Diags : nullptr, AllowCompatibleDifferences); } bool PCHValidator::ReadTargetOptions(const TargetOptions &TargetOpts, - bool Complain, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { const TargetOptions &ExistingTargetOpts = PP.getTargetInfo().getTargetOpts(); - return checkTargetOptions(TargetOpts, ExistingTargetOpts, + return checkTargetOptions(TargetOpts, ExistingTargetOpts, ModuleFilename, Complain ? 
&Reader.Diags : nullptr, AllowCompatibleDifferences); } @@ -478,6 +482,7 @@ using DeclsMap = llvm::DenseMap>; static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, DiagnosticsEngine &Diags, + StringRef ModuleFilename, bool Complain) { using Level = DiagnosticsEngine::Level; @@ -496,8 +501,11 @@ static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, StoredDiags.getDiagnosticLevel(DiagID, SourceLocation()); if (StoredLevel < DiagnosticsEngine::Error) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Werror=" + - Diags.getDiagnosticIDs()->getWarningOptionForDiag(DiagID).str(); + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Werror=" + Diags.getDiagnosticIDs() + ->getWarningOptionForDiag(DiagID) + .str() + << ModuleFilename; return true; } } @@ -514,7 +522,8 @@ static bool isExtHandlingFromDiagsError(DiagnosticsEngine &Diags) { } static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, - DiagnosticsEngine &Diags, bool IsSystem, + DiagnosticsEngine &Diags, + StringRef ModuleFilename, bool IsSystem, bool SystemHeaderWarningsInModule, bool Complain) { // Top-level options @@ -526,32 +535,37 @@ static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, if (StoredDiags.getSuppressSystemWarnings() && !SystemHeaderWarningsInModule) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Wsystem-headers"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Wsystem-headers" << ModuleFilename; return true; } } if (Diags.getWarningsAsErrors() && !StoredDiags.getWarningsAsErrors()) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Werror"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Werror" << ModuleFilename; return true; } if (Diags.getWarningsAsErrors() && Diags.getEnableAllWarnings() && !StoredDiags.getEnableAllWarnings()) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Weverything -Werror"; + 
Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Weverything -Werror" << ModuleFilename; return true; } if (isExtHandlingFromDiagsError(Diags) && !isExtHandlingFromDiagsError(StoredDiags)) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-pedantic-errors"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-pedantic-errors" << ModuleFilename; return true; } - return checkDiagnosticGroupMappings(StoredDiags, Diags, Complain); + return checkDiagnosticGroupMappings(StoredDiags, Diags, ModuleFilename, + Complain); } /// Return the top import module if it is implicit, nullptr otherwise. @@ -580,7 +594,8 @@ static Module *getTopImportImplicitModule(ModuleManager &ModuleMgr, } bool PCHValidator::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, bool Complain) { + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, + bool Complain) { DiagnosticsEngine &ExistingDiags = PP.getDiagnostics(); IntrusiveRefCntPtr DiagIDs(ExistingDiags.getDiagnosticIDs()); IntrusiveRefCntPtr Diags( @@ -605,8 +620,9 @@ bool PCHValidator::ReadDiagnosticOptions( // FIXME: if the diagnostics are incompatible, save a DiagnosticOptions that // contains the union of their flags. - return checkDiagnosticMappings(*Diags, ExistingDiags, TopM->IsSystem, - SystemHeaderWarningsInModule, Complain); + return checkDiagnosticMappings(*Diags, ExistingDiags, ModuleFilename, + TopM->IsSystem, SystemHeaderWarningsInModule, + Complain); } /// Collect the macro definitions provided by the given preprocessor @@ -665,8 +681,8 @@ enum OptionValidation { /// are no differences in the options between the two. 
static bool checkPreprocessorOptions( const PreprocessorOptions &PPOpts, - const PreprocessorOptions &ExistingPPOpts, bool ReadMacros, - DiagnosticsEngine *Diags, FileManager &FileMgr, + const PreprocessorOptions &ExistingPPOpts, StringRef ModuleFilename, + bool ReadMacros, DiagnosticsEngine *Diags, FileManager &FileMgr, std::string &SuggestedPredefines, const LangOptions &LangOpts, OptionValidation Validation = OptionValidateContradictions) { if (ReadMacros) { @@ -695,7 +711,8 @@ static bool checkPreprocessorOptions( // If strict matches are requested, don't tolerate any extra defines // on the command line that are missing in the AST file. if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) << MacroName << true; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << true << ModuleFilename; } return true; } @@ -721,8 +738,8 @@ static bool checkPreprocessorOptions( // conflict. if (Existing.second != Known->second.second) { if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) - << MacroName << Known->second.second; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << Known->second.second << ModuleFilename; } return true; } @@ -736,8 +753,9 @@ static bool checkPreprocessorOptions( // The macro bodies differ; complain. if (Diags) { - Diags->Report(diag::err_pch_macro_def_conflict) - << MacroName << Known->second.first << Existing.first; + Diags->Report(diag::err_ast_file_macro_def_conflict) + << MacroName << Known->second.first << Existing.first + << ModuleFilename; } return true; } @@ -750,7 +768,8 @@ static bool checkPreprocessorOptions( // the AST file that are missing on the command line. 
for (const auto &MacroName : ASTFileMacros.keys()) { if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) << MacroName << false; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << false << ModuleFilename; } return true; } @@ -761,7 +780,8 @@ static bool checkPreprocessorOptions( if (PPOpts.UsePredefines != ExistingPPOpts.UsePredefines && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_pch_undef) << ExistingPPOpts.UsePredefines; + Diags->Report(diag::err_ast_file_undef) + << ExistingPPOpts.UsePredefines << ModuleFilename; } return true; } @@ -771,7 +791,8 @@ static bool checkPreprocessorOptions( PPOpts.DetailedRecord != ExistingPPOpts.DetailedRecord && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_pch_pp_detailed_record) << PPOpts.DetailedRecord; + Diags->Report(diag::err_ast_file_pp_detailed_record) + << PPOpts.DetailedRecord << ModuleFilename; } return true; } @@ -815,22 +836,24 @@ static bool checkPreprocessorOptions( } bool PCHValidator::ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { const PreprocessorOptions &ExistingPPOpts = PP.getPreprocessorOpts(); return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ReadMacros, Complain ? &Reader.Diags : nullptr, - PP.getFileManager(), SuggestedPredefines, PP.getLangOpts()); + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, + Complain ? 
&Reader.Diags : nullptr, PP.getFileManager(), + SuggestedPredefines, PP.getLangOpts()); } bool SimpleASTReaderListener::ReadPreprocessorOptions( - const PreprocessorOptions &PPOpts, bool ReadMacros, bool Complain, - std::string &SuggestedPredefines) { - return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), ReadMacros, - nullptr, PP.getFileManager(), - SuggestedPredefines, PP.getLangOpts(), - OptionValidateNone); + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, + bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { + return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), + ModuleFilename, ReadMacros, nullptr, + PP.getFileManager(), SuggestedPredefines, + PP.getLangOpts(), OptionValidateNone); } /// Check that the specified and the existing module cache paths are equivalent. @@ -840,6 +863,7 @@ bool SimpleASTReaderListener::ReadPreprocessorOptions( static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS, StringRef SpecificModuleCachePath, StringRef ExistingModuleCachePath, + StringRef ModuleFilename, DiagnosticsEngine *Diags, const LangOptions &LangOpts, const PreprocessorOptions &PPOpts) { @@ -851,19 +875,20 @@ static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS, if (EqualOrErr && *EqualOrErr) return false; if (Diags) - Diags->Report(diag::err_pch_modulecache_mismatch) - << SpecificModuleCachePath << ExistingModuleCachePath; + Diags->Report(diag::err_ast_file_modulecache_mismatch) + << SpecificModuleCachePath << ExistingModuleCachePath << ModuleFilename; return true; } bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { - return checkModuleCachePath(Reader.getFileManager().getVirtualFileSystem(), - SpecificModuleCachePath, - PP.getHeaderSearchInfo().getModuleCachePath(), - Complain ? 
&Reader.Diags : nullptr, - PP.getLangOpts(), PP.getPreprocessorOpts()); + return checkModuleCachePath( + Reader.getFileManager().getVirtualFileSystem(), SpecificModuleCachePath, + PP.getHeaderSearchInfo().getModuleCachePath(), ModuleFilename, + Complain ? &Reader.Diags : nullptr, PP.getLangOpts(), + PP.getPreprocessorOpts()); } void PCHValidator::ReadCounter(const ModuleFile &M, unsigned Value) { @@ -2761,9 +2786,9 @@ static bool isDiagnosedResult(ASTReader::ASTReadResult ARR, unsigned Caps) { } ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( - BitstreamCursor &Stream, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, - std::string &SuggestedPredefines) { + BitstreamCursor &Stream, StringRef Filename, + unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, + ASTReaderListener &Listener, std::string &SuggestedPredefines) { if (llvm::Error Err = Stream.EnterSubBlock(OPTIONS_BLOCK_ID)) { // FIXME this drops errors on the floor. 
consumeError(std::move(Err)); @@ -2806,7 +2831,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( switch ((OptionsRecordTypes)MaybeRecordType.get()) { case LANGUAGE_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; - if (ParseLanguageOptions(Record, Complain, Listener, + if (ParseLanguageOptions(Record, Filename, Complain, Listener, AllowCompatibleConfigurationMismatch)) Result = ConfigurationMismatch; break; @@ -2814,7 +2839,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case TARGET_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; - if (ParseTargetOptions(Record, Complain, Listener, + if (ParseTargetOptions(Record, Filename, Complain, Listener, AllowCompatibleConfigurationMismatch)) Result = ConfigurationMismatch; break; @@ -2831,7 +2856,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case HEADER_SEARCH_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; if (!AllowCompatibleConfigurationMismatch && - ParseHeaderSearchOptions(Record, Complain, Listener)) + ParseHeaderSearchOptions(Record, Filename, Complain, Listener)) Result = ConfigurationMismatch; break; } @@ -2839,7 +2864,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case PREPROCESSOR_OPTIONS: bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; if (!AllowCompatibleConfigurationMismatch && - ParsePreprocessorOptions(Record, Complain, Listener, + ParsePreprocessorOptions(Record, Filename, Complain, Listener, SuggestedPredefines)) Result = ConfigurationMismatch; break; @@ -2976,7 +3001,7 @@ ASTReader::ReadControlBlock(ModuleFile &F, F.Kind == MK_ExplicitModule || F.Kind == MK_PrebuiltModule; ASTReadResult Result = - ReadOptionsBlock(Stream, ClientLoadCapabilities, + ReadOptionsBlock(Stream, F.FileName, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, *Listener, SuggestedPredefines); if (Result == Failure) { @@ -4872,8 +4897,8 @@ 
ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, bool DisableValidation = shouldDisableValidationForFile(F); ASTReadResult Result = readUnhashedControlBlockImpl( - &F, F.Data, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, - Listener.get(), + &F, F.Data, F.FileName, ClientLoadCapabilities, + AllowCompatibleConfigurationMismatch, Listener.get(), WasImportedBy ? false : HSOpts.ModulesValidateDiagnosticOptions); // If F was directly imported by another module, it's implicitly validated by @@ -4916,9 +4941,9 @@ ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, } ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl( - ModuleFile *F, llvm::StringRef StreamData, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, - bool ValidateDiagnosticOptions) { + ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, + unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, + ASTReaderListener *Listener, bool ValidateDiagnosticOptions) { // Initialize a stream. BitstreamCursor Stream(StreamData); @@ -4986,7 +5011,7 @@ ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl( bool Complain = (ClientLoadCapabilities & ARR_OutOfDate) == 0; if (Listener && ValidateDiagnosticOptions && !AllowCompatibleConfigurationMismatch && - ParseDiagnosticOptions(Record, Complain, *Listener)) + ParseDiagnosticOptions(Record, Filename, Complain, *Listener)) Result = OutOfDate; // Don't return early. Read the signature. 
break; } @@ -5373,32 +5398,37 @@ namespace { ExistingModuleCachePath(ExistingModuleCachePath), FileMgr(FileMgr), StrictOptionMatches(StrictOptionMatches) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkLanguageOptions(ExistingLangOpts, LangOpts, nullptr, - AllowCompatibleDifferences); + return checkLanguageOptions(ExistingLangOpts, LangOpts, ModuleFilename, + nullptr, AllowCompatibleDifferences); } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkTargetOptions(ExistingTargetOpts, TargetOpts, nullptr, - AllowCompatibleDifferences); + return checkTargetOptions(ExistingTargetOpts, TargetOpts, ModuleFilename, + nullptr, AllowCompatibleDifferences); } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { - return checkModuleCachePath( - FileMgr.getVirtualFileSystem(), SpecificModuleCachePath, - ExistingModuleCachePath, nullptr, ExistingLangOpts, ExistingPPOpts); + return checkModuleCachePath(FileMgr.getVirtualFileSystem(), + SpecificModuleCachePath, + ExistingModuleCachePath, ModuleFilename, + nullptr, ExistingLangOpts, ExistingPPOpts); } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ReadMacros, /*Diags=*/nullptr, FileMgr, - SuggestedPredefines, ExistingLangOpts, + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, /*Diags=*/nullptr, + FileMgr, SuggestedPredefines, ExistingLangOpts, 
StrictOptionMatches ? OptionValidateStrictMatches : OptionValidateContradictions); } @@ -5466,7 +5496,7 @@ bool ASTReader::readASTFileControlBlock( switch (Entry.ID) { case OPTIONS_BLOCK_ID: { std::string IgnoredSuggestedPredefines; - if (ReadOptionsBlock(Stream, ClientLoadCapabilities, + if (ReadOptionsBlock(Stream, Filename, ClientLoadCapabilities, /*AllowCompatibleConfigurationMismatch*/ false, Listener, IgnoredSuggestedPredefines) != Success) return true; @@ -5692,7 +5722,7 @@ bool ASTReader::readASTFileControlBlock( // Scan for the UNHASHED_CONTROL_BLOCK_ID block. if (readUnhashedControlBlockImpl( - nullptr, Bytes, ClientLoadCapabilities, + nullptr, Bytes, Filename, ClientLoadCapabilities, /*AllowCompatibleConfigurationMismatch*/ false, &Listener, ValidateDiagnosticOptions) != Success) return true; @@ -6033,7 +6063,7 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, /// /// \returns true if the listener deems the file unacceptable, false otherwise. bool ASTReader::ParseLanguageOptions(const RecordData &Record, - bool Complain, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences) { LangOptions LangOpts; @@ -6070,11 +6100,12 @@ bool ASTReader::ParseLanguageOptions(const RecordData &Record, LangOpts.OMPHostIRFile = ReadString(Record, Idx); - return Listener.ReadLanguageOptions(LangOpts, Complain, + return Listener.ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } -bool ASTReader::ParseTargetOptions(const RecordData &Record, bool Complain, +bool ASTReader::ParseTargetOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences) { unsigned Idx = 0; @@ -6090,11 +6121,12 @@ bool ASTReader::ParseTargetOptions(const RecordData &Record, bool Complain, TargetOpts.Features.push_back(ReadString(Record, Idx)); } - return Listener.ReadTargetOptions(TargetOpts, Complain, + return 
Listener.ReadTargetOptions(TargetOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } -bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, bool Complain, +bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener) { IntrusiveRefCntPtr DiagOpts(new DiagnosticOptions); unsigned Idx = 0; @@ -6108,7 +6140,7 @@ bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, bool Complain, for (unsigned N = Record[Idx++]; N; --N) DiagOpts->Remarks.push_back(ReadString(Record, Idx)); - return Listener.ReadDiagnosticOptions(DiagOpts, Complain); + return Listener.ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain); } bool ASTReader::ParseFileSystemOptions(const RecordData &Record, bool Complain, @@ -6120,6 +6152,7 @@ bool ASTReader::ParseFileSystemOptions(const RecordData &Record, bool Complain, } bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener) { HeaderSearchOptions HSOpts; @@ -6139,8 +6172,8 @@ bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, HSOpts.UseLibcxx = Record[Idx++]; std::string SpecificModuleCachePath = ReadString(Record, Idx); - return Listener.ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, - Complain); + return Listener.ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain); } bool ASTReader::ParseHeaderSearchPaths(const RecordData &Record, bool Complain, @@ -6176,6 +6209,7 @@ bool ASTReader::ParseHeaderSearchPaths(const RecordData &Record, bool Complain, } bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines) { @@ -6208,8 +6242,8 @@ bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, PPOpts.ObjCXXARCStandardLibrary = static_cast(Record[Idx++]); SuggestedPredefines.clear(); - 
return Listener.ReadPreprocessorOptions(PPOpts, ReadMacros, Complain, - SuggestedPredefines); + return Listener.ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines); } std::pair diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index fcf1579d86dda7f..c118f3818467d93 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2391,7 +2391,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl( VisitDecl(D); llvm::SmallVector Args; for (unsigned I = 0; I < D->NumTemplateArgs; ++I) - Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/false)); + Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/true)); D->setTemplateArguments(Args); } diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp index 815fb67b9bbfc53..a46f6ed747ec2f3 100644 --- a/clang/test/AST/Interp/literals.cpp +++ b/clang/test/AST/Interp/literals.cpp @@ -1196,13 +1196,15 @@ namespace incdecbool { } #if __cplusplus >= 201402L -/// NOTE: The diagnostics of the two interpreters are a little -/// different here, but they both make sense. constexpr int externvar1() { // both-error {{never produces a constant expression}} - extern char arr[]; // ref-note {{declared here}} - return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \ - // expected-note {{indexing of array without known bound}} + extern char arr[]; // both-note {{declared here}} + return arr[0]; // both-note {{read of non-constexpr variable 'arr'}} } +namespace externarr { + extern int arr[]; + constexpr int *externarrindex = &arr[0]; /// No diagnostic. 
+} + namespace StmtExprs { constexpr int foo() { diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp index 4b8e9026b2916b0..a5e0673c241ef41 100644 --- a/clang/test/AST/ast-dump-concepts.cpp +++ b/clang/test/AST/ast-dump-concepts.cpp @@ -20,9 +20,8 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'binary_concept' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9 - // CHECK-NEXT: | |-TemplateArgument type 'R' - // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0 - // CHECK-NEXT: | | `-TemplateTypeParm {{.*}} 'R' + // CHECK-NEXT: | |-TemplateArgument type 'type-parameter-1-0' + // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 // CHECK-NEXT: | `-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType {{.*}} 'int' // CHECK-NEXT: |-TemplateArgument {{.*}} type 'R' @@ -36,9 +35,8 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9 - // CHECK-NEXT: | `-TemplateArgument type 'R' - // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0 - // CHECK-NEXT: | `-TemplateTypeParm {{.*}} 'R' + // CHECK-NEXT: | `-TemplateArgument type 'type-parameter-1-0' + // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 template Foo(R); diff --git a/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c b/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c index cedb3f1192eda30..075f583784fe192 100644 --- a/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c +++ b/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 
-fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c index 79a453d239cd528..0c0525823414312 100644 --- a/clang/test/AST/attr-counted-by-struct-ptrs.c +++ b/clang/test/AST/attr-counted-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c b/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c index 6189799b85ccb2e..73b8a71f23503ea 100644 --- a/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c +++ b/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/AST/attr-sized-by-struct-ptrs.c b/clang/test/AST/attr-sized-by-struct-ptrs.c index 5d9ed0094c685ba..7f7e3dfea2ac73c 100644 --- a/clang/test/AST/attr-sized-by-struct-ptrs.c +++ b/clang/test/AST/attr-sized-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __sized_by(f) __attribute__((sized_by(f))) diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index f01d96ad47f3e3a..a23ac7444363315 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -505,16 +505,6 @@ namespace cwg1467 { // cwg1467: 3.7 c++11 } } // nonaggregate - namespace SelfInitIsNotListInit { - struct S { - S(); - explicit S(S &); - S(const S &); - }; - S s1; - S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor - 
} - struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff --git a/clang/test/CXX/drs/cwg21xx.cpp b/clang/test/CXX/drs/cwg21xx.cpp index d7bc52dd9d44646..2800228748e609b 100644 --- a/clang/test/CXX/drs/cwg21xx.cpp +++ b/clang/test/CXX/drs/cwg21xx.cpp @@ -12,7 +12,15 @@ #endif namespace std { -struct type_info; + typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; + + struct type_info; } namespace cwg2100 { // cwg2100: 12 @@ -136,6 +144,41 @@ namespace cwg2126 { // cwg2126: 12 #endif } +namespace cwg2137 { // cwg2137: 20 +#if __cplusplus >= 201103L + struct Q { + Q(); + Q(Q&&); + Q(std::initializer_list) = delete; // #cwg2137-Qcons + }; + + Q x = Q { Q() }; + // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#cwg2137-Qcons {{'Q' has been explicitly marked deleted here}} + + int f(Q); // #cwg2137-f + int y = f({ Q() }); + // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#cwg2137-Qcons {{'Q' has been explicitly marked deleted here}} + // since-cxx11-note@#cwg2137-f {{passing argument to parameter here}} + + struct U { + U(); + U(const U&); + }; + + struct Derived : U { + Derived(); + Derived(const Derived&); + } d; + + int g(Derived); + int g(U(&&)[1]) = delete; + + int z = g({ d }); +#endif +} + namespace cwg2140 { // cwg2140: 9 #if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index e4a1e90941dbf00..77fd6a337436e3a 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -6,6 +6,16 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s 
-verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +namespace std { + __extension__ typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; +} + #if __cplusplus >= 201103L namespace cwg2303 { // cwg2303: 12 template @@ -94,6 +104,95 @@ struct Z : W, // cwg2331: na // cwg2335 is in cwg2335.cxx +namespace cwg2311 { // cwg2311 is open with no proposed resolution +#if __cplusplus >= 201707L +template +void test() { + // Ensure none of these try to call a move constructor. + T a = T{T(0)}; + T b{T(0)}; + auto c{T(0)}; + T d = {T(0)}; + auto e = {T(0)}; +#if __cplusplus >= 202302L + auto f = auto{T(0)}; +#endif + void(*fn)(T); + fn({T(0)}); +} + +struct NonMovable { + NonMovable(int); + NonMovable(NonMovable&&) = delete; +}; +struct NonMovableNonApplicableIList { + NonMovableNonApplicableIList(int); + NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; + NonMovableNonApplicableIList(std::initializer_list); +}; +struct ExplicitMovable { + ExplicitMovable(int); + explicit ExplicitMovable(ExplicitMovable&&); +}; +struct ExplicitNonMovable { + ExplicitNonMovable(int); + explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; +}; +struct ExplicitNonMovableNonApplicableIList { + ExplicitNonMovableNonApplicableIList(int); + explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; + ExplicitNonMovableNonApplicableIList(std::initializer_list); +}; +struct CopyOnly { + CopyOnly(int); + CopyOnly(const CopyOnly&); + CopyOnly(CopyOnly&&) = delete; +}; +struct ExplicitCopyOnly { + ExplicitCopyOnly(int); + explicit ExplicitCopyOnly(const ExplicitCopyOnly&); + explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; +}; + +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); 
+template void test(); +template void test(); + +struct any { + template + any(T&&); +}; + +template +struct X { + X(); + X(T) = delete; // #cwg2311-X +}; + +X> x{ X>() }; +// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} +// since-cxx17-note@#cwg2311-X {{'X' has been explicitly marked deleted here}} + +// Per the currently implemented resolution, this does not apply to std::initializer_list. +// An initializer list initialized from `{ e }` always has exactly one element constructed +// from `e`, where previously that could have been a copy of an init list or `e.operator std::initializer_list()` +struct InitListCtor { + InitListCtor(int); + InitListCtor(InitListCtor&&) = delete; + InitListCtor(std::initializer_list) = delete; // #cwg2311-InitListCtor +}; + +std::initializer_list i; +auto j = std::initializer_list{ i }; +// since-cxx17-error@-1 {{conversion function from 'std::initializer_list' to 'const cwg2311::InitListCtor' invokes a deleted function}} +// since-cxx17-note@#cwg2311-InitListCtor {{'InitListCtor' has been explicitly marked deleted here}} +#endif +} + #if __cplusplus >= 201103L namespace cwg2338 { // cwg2338: 12 namespace B { diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index 3d6f0b11fa99c90..dc0e84280e05672 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -35,14 +35,14 @@ using r2i2 = r2; // expected-error{{constraints not satisfied for class templ using r2i3 = r2; using r2i4 = r2; // expected-error{{constraints not satisfied for class template 'r2' [with T = const D]}} -template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to 
an incomplete type 'class nonexistent'}} +template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} // Non-dependent expressions @@ -149,7 +149,7 @@ namespace std_example { template constexpr bool is_same_v = true; template concept same_as = is_same_v; - // expected-note@-1 {{because 'is_same_v' evaluated to false}} + // expected-note@-1 {{because 'is_same_v' evaluated to false}} static_assert(C1); static_assert(C1); @@ -173,9 +173,9 @@ namespace std_example { int operator *() { return 0; } }; static_assert(C2); - template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'T2' does not satisfy 'C2'}} + template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'std_example::T2' does not satisfy 'C2'}} using c2c1 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = int]}} - using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = T2]}} + using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::T2]}} template void g(T t) noexcept(sizeof(T) == 1) {} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp 
index 00ac9d0422d67e5..763d983d20f6152 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ -27,7 +27,7 @@ using r4i = X::r4; // expected-error{{constraints not satisfied for c // C++ [expr.prim.req.nested] Examples namespace std_example { - template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(decltype(+t)) == 1' (4 == 1) evaluated to false}} + template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} template concept D = requires (T t) { requires C1; // expected-note{{because 'decltype(+t)' (aka 'int') does not satisfy 'C1'}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp index abfadfa34884119..7515f5c62d5ea88 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp @@ -39,14 +39,14 @@ using r2i4 = r2; // expected-error{{constraints not satisfied for class template requires requires { sizeof(T); } // expected-note@-1{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} -// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} +// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} template requires requires 
(T t) { 0; "a"; (void)'a'; } struct r4 {}; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp index 28dff336d053c6f..5433cfb21955dd8 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp @@ -182,14 +182,14 @@ namespace std_example { static_assert(C1 && C2 && C3); template struct C1_check {}; // expected-note@-1 {{because 'int' does not satisfy 'C1'}} - // expected-note@-2 {{because 'has_type' does not satisfy 'C1'}} + // expected-note@-2 {{because 'std_example::has_type' does not satisfy 'C1'}} template struct C2_check {}; - // expected-note@-1 {{because 'has_inner' does not satisfy 'C2'}} + // expected-note@-1 {{because 'std_example::has_inner' does not satisfy 'C2'}} template struct C3_check {}; // expected-note@-1 {{because 'void' does not satisfy 'C3'}} using c1 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = int]}} - using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = has_type]}} - using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = has_inner]}} + using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = std_example::has_type]}} + using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::has_inner]}} using c4 = C3_check; // expected-error{{constraints not satisfied for class template 'C3_check' [with T = void]}} } @@ -199,10 +199,10 @@ template concept C = requires { requires requires { T::a; }; }; // expected-note@-1 {{because 'T::a' would be invalid: no member named 'a' in 'PR48656::T1'}} template struct A {}; -// expected-note@-1 {{because 'T1' does not satisfy 'C'}} +// expected-note@-1 {{because 'PR48656::T1' 
does not satisfy 'C'}} struct T1 {}; -template struct A; // expected-error {{constraints not satisfied for class template 'A' [with $0 = ]}} +template struct A; // expected-error {{constraints not satisfied for class template 'A' [with $0 = ]}} struct T2 { static constexpr bool a = false; }; template struct A; diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm index 1a01ffac0154aef..84ef85126c369a0 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++20 %s -verify -o /dev/null +// RUN: %clang_cc1 -std=c++20 %s -verify -emit-module-interface -o /dev/null // RUN: %clang_cc1 -std=c++20 %s -DINTERFACE -verify -emit-module-interface -o %t // RUN: %clang_cc1 -std=c++20 %s -DIMPLEMENTATION -verify -fmodule-file=A=%t -o /dev/null // @@ -15,6 +15,8 @@ module A; // #module-decl // expected-error@-2 {{missing 'export' specifier in module declaration while building module interface}} #define INTERFACE #endif +#else // Not in a module +// expected-error@* {{missing 'export module' declaration in module interface unit}} #endif #ifndef INTERFACE diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp index 02c97b4591a1566..d80710937cdfa1d 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp @@ -8,7 +8,7 @@ template requires Bar && true struct S { }; template concept True2 = sizeof(T) >= 0; template concept Foo2 = True2; -// expected-error@-1{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-error@-1{{'type name' declared as a pointer to a reference of type 'type-parameter-0-0 &'}} template concept Bar2 = Foo2; // expected-note@-1{{while 
substituting into concept arguments here; substitution failures not allowed in concept arguments}} template requires Bar2 struct S2 { }; diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index 97e0ef35837b16d..4f5fdd3b4809ac5 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -94,8 +94,8 @@ concept OneOf = (is_same_v || ...); // expected-note@-5 {{and 'is_same_v' evaluated to false}} // expected-note@-6 3{{because 'is_same_v' evaluated to false}} // expected-note@-7 3{{and 'is_same_v' evaluated to false}} -// expected-note@-8 2{{because 'is_same_v' evaluated to false}} -// expected-note@-9 2{{and 'is_same_v' evaluated to false}} +// expected-note@-8 2{{because 'is_same_v' evaluated to false}} +// expected-note@-9 2{{and 'is_same_v' evaluated to false}} template T, OneOf U> // expected-note@-1 2{{because 'OneOf' evaluated to false}} diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c new file mode 100644 index 000000000000000..75fc396e3e0afe9 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-ZOS +// RUN: %clang_cc1 -triple s390x-ibm-linux -fzos-extensions -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-LINUX +// RUN: %clang_cc1 -triple s390x-linux-gnu -fzos-extensions -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-LINUX + +void ptr32_declarations() { + // PTR32-ZOS-LABEL: @ptr32_declarations() + // PTR32-LINUX-LABEL: @ptr32_declarations() + + // PTR32-ZOS: %p1 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p1 = alloca i8 addrspace(1)*, align 4 + // PTR32-LINUX: %p1 = alloca ptr, align 8 + char * __ptr32 p1; + + // PTR32-ZOS: %p2 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p2 = alloca ptr addrspace(1), align 8 + // 
PTR32-LINUX: %p2 = alloca ptr, align 8 + char * __ptr32 *p2; + + // PTR32-ZOS: %p3 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p3 = alloca i8* addrspace(1)*, align 4 + // PTR32-LINUX: %p3 = alloca ptr, align 8 + char ** __ptr32 p3; + + // PTR32-ZOS: %p4 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p4 = alloca ptr addrspace(1), align 8 + // PTR32-LINUX: %p4 = alloca ptr, align 8 + char ** __ptr32 *p4; + + // PTR32-ZOS: %p5 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p5 = alloca ptr addrspace(1), align 8 + // PTR32-LINUX: %p5 = alloca ptr, align 8 + char *** __ptr32 *p5; + + // PTR32-ZOS: %p6 = alloca ptr, align 8 + // PTR32-LINUX: %p6 = alloca ptr, align 8 + char **p6; + + // PTR32-ZOS: %p7 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p7 = alloca i8 addrspace(1)* addrspace(1)*, align 4 + // PTR32-LINUX: %p7 = alloca ptr, align 8 + char * __ptr32 * __ptr32 p7; + + // PTR32-ZOS: %p8 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p8 = alloca i8* addrspace(1)* addrspace(1)*, align 4 + // PTR32-LINUX: %p8 = alloca ptr, align 8 + char ** __ptr32 * __ptr32 p8; + + // PTR32-ZOS: %p9 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p9 = alloca i8* addrspace(1)* addrspace(1)**, align 8 + // PTR32-LINUX: %p9 = alloca ptr, align 8 + char ** __ptr32 * __ptr32 *p9; + +} diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c new file mode 100644 index 000000000000000..1b05e8aa5052aff --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c @@ -0,0 +1,84 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos -O2 -emit-llvm %s -o - | FileCheck %s --check-prefix=X64 +#include +void *__malloc31(size_t); + +int test_1() { + // X64-LABEL: define {{.*}} i32 @test_1() + // X64: ret i32 135 + int *__ptr32 a; + int *b; + int i; + int sum1, sum2, sum3; + + a = (int *__ptr32)__malloc31(sizeof(int) * 10); + + b = a; + sum1 = 0; + for (i = 0; i < 10; ++i) { + a[i] = i; + sum1 += 
i; + } + + sum2 = 0; + for (i = 0; i < 10; ++i) { + sum2 += a[i]; + } + sum3 = 0; + for (i = 0; i < 10; ++i) { + sum3 += b[i]; + } + + return (sum1 + sum2 + sum3); +} + +int test_2() { + // X64-LABEL: define {{.*}} i32 @test_2() + // X64: ret i32 4 + int *a = (int *)__malloc31(sizeof(int)); + int *__ptr32 b; + + *a = 99; + b = a; + *b = 44; + + // Test should return 4 + return (*b - 40); +} + +int test_3() { + // X64-LABEL: define {{.*}} i32 @test_3() + // X64: ret i32 4 + int *a = (int *)__malloc31(sizeof(int)); + int *__ptr32 b; + + *a = 99; + b = a; + + // Test should return 4 + return (*b - 95); +} + +int test_4() { + // X64-LABEL: define {{.*}} i32 @test_4() + // X64: ret i32 1 + int *a = (int *)__malloc31(sizeof(int)); + float *d = (float *)__malloc31(sizeof(float)); + + int *__ptr32 b; + int *c; + + float *__ptr32 e; + float *f; + + *a = 0; + *d = 0.0; + + b = a; + c = a; + e = d; + f = d; + + // Test should return 1 + return (b == c && e == f); +} + diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c new file mode 100644 index 000000000000000..6b434a926f706b0 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 -emit-llvm -triple s390x-ibm-zos -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-ZOS +// RUN: %clang_cc1 -emit-llvm -triple s390x-ibm-linux -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-LINUX +// RUN: %clang_cc1 -emit-llvm -triple s390x-linux-gnu -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-LINUX + +// PTR32-ZOS: 0 | struct s1 +// PTR32-ZOS-NEXT: 0 | long a +// PTR32-ZOS-NEXT: 8 | int b +// PTR32-ZOS-NEXT: 12 | int * __ptr32 c +// PTR32-ZOS-NEXT: 16 | int d +// PTR32-ZOS-NEXT: | [sizeof=24, align=8] + +// PTR32-LINUX: 0 | struct s1 +// PTR32-LINUX-NEXT: 0 | long a +// PTR32-LINUX-NEXT: 8 | int b +// PTR32-LINUX-NEXT: 
16 | int * __ptr32 c +// PTR32-LINUX-NEXT: 24 | int d +// PTR32-LINUX-NEXT: | [sizeof=32, align=8] +struct s1 { + long a; + int b; + int * __ptr32 c; + int d; +} S1; + +// PTR32-ZOS: 0 | struct s2 +// PTR32-ZOS-NEXT: 0 | long a +// PTR32-ZOS-NEXT: 8 | int b +// PTR32-ZOS-NEXT: 16 | int * c +// PTR32-ZOS-NEXT: 24 | int d +// PTR32-ZOS-NEXT: | [sizeof=32, align=8] + +// PTR32-LINUX: 0 | struct s2 +// PTR32-LINUX-NEXT: 0 | long a +// PTR32-LINUX-NEXT: 8 | int b +// PTR32-LINUX-NEXT: 16 | int * c +// PTR32-LINUX-NEXT: 24 | int d +// PTR32-LINUX-NEXT: | [sizeof=32, align=8] +struct s2 { + long a; + int b; + int *c; + int d; +} S2; + +// PTR32-ZOS: 0 | struct s3 +// PTR32-ZOS-NEXT: 0 | int a +// PTR32-ZOS-NEXT: 4 | int * __ptr32 b +// PTR32-ZOS-NEXT: 8 | int * __ptr32 c +// PTR32-ZOS-NEXT: 12 | int * d +// PTR32-ZOS-NEXT: | [sizeof=20, align=1] + +struct __attribute__((packed)) s3 { + int a; + int *__ptr32 b; + int *__ptr32 c; + int *d; +}; +struct s3 S3; + +// PTR32-ZOS: 0 | union u1 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | int * b +// PTR32-ZOS-NEXT: | [sizeof=8, align=8] + +// PTR32-LINUX: 0 | union u1 +// PTR32-LINUX-NEXT: 0 | int * __ptr32 a +// PTR32-LINUX-NEXT: 0 | int * b +// PTR32-LINUX-NEXT: | [sizeof=8, align=8] +union u1 { + int *__ptr32 a; + int *b; +} U1; + +// PTR32-ZOS: 0 | union u2 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | int * b +// PTR32-ZOS-NEXT: | [sizeof=8, align=1] + +union __attribute__((packed)) u2 { + int *__ptr32 a; + int *b; +}; +union u2 U2; + +// PTR32-ZOS: 0 | union u3 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | short b +// PTR32-ZOS-NEXT: | [sizeof=4, align=1] + +union __attribute__((packed)) u3 { + int *__ptr32 a; + short b; +}; +union u3 U3; diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c new file mode 100644 index 000000000000000..6194c9b1804fb0b --- /dev/null +++ 
b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c @@ -0,0 +1,298 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos -emit-llvm -O2 < %s | FileCheck %s --check-prefix=X64 + +#define PSA_PTR 0x00 +#define PSAAOLD 0x224 + +struct Foo { + int * __ptr32 p32; + int *p64; + char *cp64; +}; + +void use_foo(struct Foo *f); + +void ptr32_to_ptr(struct Foo *f, int * __ptr32 i) { + // X64-LABEL: define void @ptr32_to_ptr(ptr noundef %f, ptr addrspace(1) noundef %i) + // X64: %{{.+}} = addrspacecast ptr addrspace(1) %i to ptr + f->p64= i; + use_foo(f); +} + +void ptr_to_ptr32(struct Foo *f, int *i) { + // X64-LABEL: define void @ptr_to_ptr32(ptr noundef %f, ptr noundef %i) + // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(1) + f->p32 = i; + use_foo(f); +} + +void ptr32_to_ptr32(struct Foo *f, int * __ptr32 i) { + // X64-LABEL: define void @ptr32_to_ptr32(ptr noundef %f, ptr addrspace(1) noundef %i) + // X64-NOT: addrspacecast + f->p32 = i; + use_foo(f); +} + +void ptr_to_ptr32_explicit_cast(struct Foo *f, int *i) { + // X64-LABEL: define void @ptr_to_ptr32_explicit_cast(ptr noundef %f, ptr noundef %i) + // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(1) + f->p32 = (int * __ptr32)i; + use_foo(f); +} + +void test_indexing(struct Foo *f) { + // X64-LABEL: define void @test_indexing(ptr noundef %f) + // X64: addrspacecast ptr addrspace(1) {{%[0-9]}} to ptr + f->cp64 = ((char * __ptr32 *)1028)[1]; + use_foo(f); +} + +void test_indexing_2(struct Foo *f) { + // X64-LABEL: define void @test_indexing_2(ptr noundef %f) + // X64: getelementptr inbounds i8, ptr addrspace(1) {{%[0-9]}}, i32 16 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 24 + f->cp64 = ((char *** __ptr32 *)1028)[1][2][3]; + use_foo(f); +} + +unsigned long* test_misc() { + // X64-LABEL: define ptr @test_misc() + // X64: %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %0, i32 88 + // X64-NEXT: %1 = load ptr, ptr addrspace(1) %arrayidx + // X64-NEXT: %arrayidx1 = getelementptr inbounds i8, ptr 
%1, i64 8 + // X64-NEXT: %2 = load ptr, ptr %arrayidx1 + // X64-NEXT: %arrayidx2 = getelementptr inbounds i8, ptr %2, i64 904 + // X64-NEXT: %3 = load ptr, ptr %arrayidx2 + // X64-NEXT: %arrayidx3 = getelementptr inbounds i8, ptr %3, i64 1192 + unsigned long* x = (unsigned long*)((char***** __ptr32*)1208)[0][11][1][113][149]; + return x; +} + +char* __ptr32* __ptr32 test_misc_2() { + // X64-LABEL: define ptr addrspace(1) @test_misc_2() + // X64: br i1 %cmp, label %if.then, label %if.end + // X64: %1 = load ptr addrspace(1), ptr inttoptr (i64 16 to ptr) + // X64-NEXT: %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %1, i32 544 + // X64-NEXT: %2 = load ptr addrspace(1), ptr addrspace(1) %arrayidx + // X64-NEXT: %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %2, i32 24 + // X64-NEXT: %3 = load ptr addrspace(1), ptr addrspace(1) %arrayidx1 + // X64-NEXT: store ptr addrspace(1) %3, ptr @test_misc_2.res + // X64: ret ptr addrspace(1) + static char* __ptr32* __ptr32 res = 0; + if (res == 0) { + res = ((char* __ptr32* __ptr32* __ptr32* __ptr32*)0)[4][136][6]; + } + return res; +} + +unsigned short test_misc_3() { + // X64-LABEL: define zeroext i16 @test_misc_3() + // X64: %0 = load ptr addrspace(1), ptr inttoptr (i64 548 to ptr) + // X64-NEXT: %1 = addrspacecast ptr addrspace(1) %0 to ptr + // X64-NEXT: %arrayidx = getelementptr inbounds i8, ptr %1, i64 36 + // X64-NEXT: %2 = load i16, ptr %arrayidx, align 2 + // X64-NEXT: ret i16 %2 + unsigned short this_asid = ((unsigned short*)(*(char* __ptr32*)(0x224)))[18]; + return this_asid; +} + +int test_misc_4() { + // X64-LABEL: define signext range(i32 0, 2) i32 @test_misc_4() + // X64: getelementptr inbounds i8, ptr addrspace(1) {{%[0-9]}}, i32 88 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 8 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 984 + // X64: getelementptr inbounds i8, ptr %3, i64 80 + // X64: icmp sgt i32 {{.*[0-9]}}, 67240703 + // X64: ret i32 + int a = (*(int*)(80 + 
((char**** __ptr32*)1208)[0][11][1][123]) > 0x040202FF); + return a; +} + +void test_misc_5(struct Foo *f) { + // X64-LABEL: define void @test_misc_5(ptr noundef %f) + // X64: addrspacecast ptr addrspace(1) %0 to ptr + f->cp64 = *(char* __ptr32 *)(PSA_PTR + PSAAOLD); + use_foo(f); +} + +int test_misc_6() { + // X64-LABEL: define {{.*}} i32 @test_misc_6() + // X64: ret i32 8 + int * __ptr32 ip32; + int *ip64; + ip64 = ip32; + return sizeof(ip64); +} + +int test_misc_7() { + // X64-LABEL: define {{.*}} i32 @test_misc_7() + // X64: ret i32 12 + int foo = 12; + + int *ip64; + int * __ptr32 ip32; + + ip64 = &foo; + ip32 = (int * __ptr32) ip64; + + return *ip32; +} + +int test_misc_8() { + // X64-LABEL: define {{.*}} i32 @test_misc_8() + // X64: ret i32 97 + char foo = 'a'; + + char *cp64; + char * __ptr32 cp32; + + cp64 = &foo; + cp32 = (char * __ptr32) cp64; + + return *cp32; +} + +int test_misc_9() { + // X64-LABEL: define {{.*}} i32 @test_misc_9() + // X64: ret i32 15 + int foo = 15; + + int *ip64; + int * __ptr32 ip32; + + ip32 = &foo; + ip64 = (int *)ip32; + + return *ip64; +} + +int test_misc_10() { + // X64-LABEL: define {{.*}} i32 @test_misc_10() + // X64: ret i32 97 + char foo = 'a'; + + char *cp64; + char * __ptr32 cp32; + + cp32 = &foo; + cp64= (char *)cp32; + + return *cp64; +} + +int test_function_ptr32_is_32bit() { + // X64-LABEL: define {{.*}} i32 @test_function_ptr32_is_32bit() + // X64: ret i32 4 + int (* __ptr32 a)(int a); + return sizeof(a); +} + +int get_processor_count() { + // X64-LABEL: define signext range(i32 -128, 128) i32 @get_processor_count() + // X64: load ptr addrspace(1), ptr inttoptr (i64 16 to ptr) + // X64-NEXT: [[ARR_IDX1:%[a-z].*]] = getelementptr inbounds i8, ptr addrspace(1) %0, i32 660 + // X64: load ptr addrspace(1), ptr addrspace(1) [[ARR_IDX1]] + // X64: load i8, ptr addrspace(1) {{%[a-z].*}} + // X64: sext i8 {{%[0-9]}} to i32 + // X64-NEXT: ret i32 + return ((char * __ptr32 * __ptr32 *)0)[4][165][53]; +} + +int 
get_sizes_ptr32() { + // X64-LABEL: define {{.*}} i32 @get_sizes_ptr32() + // X64: ret i32 72 + char * __ptr32 a; + signed char * __ptr32 b; + unsigned char *__ptr32 c; + int * __ptr32 d; + signed int * __ptr32 e; + unsigned int *__ptr32 f; + short * __ptr32 g; + signed short * __ptr32 h; + unsigned short * __ptr32 i; + long * __ptr32 j; + signed * __ptr32 k; + unsigned * __ptr32 l; + long long * __ptr32 m; + signed long long * __ptr32 n; + unsigned long long * __ptr32 o; + float * __ptr32 p; + double * __ptr32 q; + long double * __ptr32 r; + + int sum = 0; + sum += sizeof(a); + sum += sizeof(b); + sum += sizeof(c); + sum += sizeof(d); + sum += sizeof(e); + sum += sizeof(f); + sum += sizeof(g); + sum += sizeof(h); + sum += sizeof(i); + sum += sizeof(j); + sum += sizeof(k); + sum += sizeof(l); + sum += sizeof(m); + sum += sizeof(n); + sum += sizeof(o); + sum += sizeof(p); + sum += sizeof(q); + sum += sizeof(r); + + return sum; +} + +int get_sizes_p64() { + // X64-LABEL: define {{.*}} i32 @get_sizes_p64() + // X64: ret i32 144 + char *a; + signed char *b; + unsigned char *c; + int *d; + signed int *e; + unsigned int *f; + short *g; + signed short *h; + unsigned short *i; + long *j; + signed *k; + unsigned *l; + long long *m; + signed long long *n; + unsigned long long *o; + float *p; + double *q; + long double *r; + + int sum = 0; + sum += sizeof(a); + sum += sizeof(b); + sum += sizeof(c); + sum += sizeof(d); + sum += sizeof(e); + sum += sizeof(f); + sum += sizeof(g); + sum += sizeof(h); + sum += sizeof(i); + sum += sizeof(j); + sum += sizeof(k); + sum += sizeof(l); + sum += sizeof(m); + sum += sizeof(n); + sum += sizeof(o); + sum += sizeof(p); + sum += sizeof(q); + sum += sizeof(r); + + return sum; + +} + +int host_cpu() { + char *__ptr32 CVT = *(char * __ptr32 *__ptr32) 16; + unsigned short Id = *(unsigned short *)&CVT[-6]; + Id = ((((Id >> 12) & 0x0f) * 10 + ((Id >> 8) & 0x0f)) * 10 + ((Id >> 4) & 0x0f)) * 10 + (Id & 0x0f); + int HaveVectorSupport = CVT[244] & 
0x80; + int z13 = (Id >= 2964 && HaveVectorSupport); + return z13; +} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 9a9fda70226fc2a..41cbd5a0219d5ec 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -235,7 +235,7 @@ // RUN: FileCheck %s -check-prefix=ZOS // RUN: %clang_cc1 -triple s390x-none-zos -target-cpu z13 -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=ZOS -// ZOS: target datalayout = "E-m:l-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" +// ZOS: target datalayout = "E-m:l-p1:32:32-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" // RUN: %clang_cc1 -triple msp430-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=MSP430 diff --git a/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp b/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp new file mode 100644 index 000000000000000..d14ce117b2be4ae --- /dev/null +++ b/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -fzos-extensions -emit-llvm -triple s390x-ibm-zos -x c++ -o - %s | FileCheck %s --check-prefixes=CHECK + +// CHECK-LABEL: define void @_Z2f1v() +void f1() {} + +// CHECK-LABEL: define void @_Z2f2Pi(ptr addrspace(1) noundef %p32) +void f2(int * __ptr32 p32) {} + +// CHECK-LABEL: define noundef ptr addrspace(1) @_Z2f3Pi(ptr addrspace(1) noundef %p32) +int * __ptr32 f3(int * __ptr32 p32) { + return p32; +} + +// CHECK-LABEL: define noundef ptr @_Z2f4PPi(ptr noundef %p32) +int * __ptr32 *f4(int * __ptr32 *p32) { + return p32; +} diff --git a/clang/test/Driver/ppc-dependent-options.cpp b/clang/test/Driver/ppc-dependent-options.cpp index 414ed1e70bb3054..46d6beafbc174c1 100644 --- a/clang/test/Driver/ppc-dependent-options.cpp +++ b/clang/test/Driver/ppc-dependent-options.cpp @@ -89,6 +89,34 @@ // RUN: -std=c++11 -msoft-float -mvsx %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-VSX +// RUN: not %clang -target 
powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower8-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P8VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower9-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P9VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower10-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P10VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mdirect-move %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-DIRECTMOVE + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mmma %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-MMA + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpaired-vector-memops %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-PAIREDVECMEMOP + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mcrypto %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-CRYPTO + #ifdef __VSX__ static_assert(false, "VSX enabled"); #endif @@ -126,5 +154,13 @@ static_assert(false, "Neither enabled"); // CHECK-NVSX: Neither enabled // CHECK-VSX: VSX enabled // CHECK-NALTI-VSX: error: option '-mvsx' cannot be specified with '-mno-altivec' -// CHECK-SOFTFLT-ALTI: error: option '-msoft-float' cannot be specified with '-maltivec' -// CHECK-SOFTFLT-VSX: error: option '-msoft-float' cannot be specified with '-mvsx' +// CHECK-SOFTFLT-ALTI: error: option '-maltivec' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-VSX: error: option '-mvsx' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-FLOAT128: error: option '-mfloat128' cannot be specified 
with '-msoft-float' +// CHECK-SOFTFLT-P8VEC: error: option '-mpower8-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-P9VEC: error: option '-mpower9-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-P10VEC: error: option '-mpower10-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-DIRECTMOVE: error: option '-mdirect-move' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-MMA: error: option '-mmma' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-PAIREDVECMEMOP: error: option '-mpaired-vector-memops' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-CRYPTO: error: option '-mcrypto' cannot be specified with '-msoft-float' diff --git a/clang/test/Driver/ppc-soft-float.c b/clang/test/Driver/ppc-soft-float.c new file mode 100644 index 000000000000000..18d768aec0dedce --- /dev/null +++ b/clang/test/Driver/ppc-soft-float.c @@ -0,0 +1,28 @@ +// RUN: %clang -target powerpc64-unknown-unknown -mcpu=pwr10 -msoft-float -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECKSOFT +// RUN: %clang -target powerpc64-unknown-unknown -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECKNOSOFT + +int main () { + return 0; +} + +// CHECKSOFT-DAG: -hard-float +// CHECKSOFT-DAG: -vsx +// CHECKSOFT-DAG: -altivec +// CHECKSOFT-DAG: -direct-move +// CHECKSOFT-DAG: -float128 +// CHECKSOFT-DAG: -mma +// CHECKSOFT-DAG: -paired-vector-memops +// CHECKSOFT-DAG: -power10-vector +// CHECKSOFT-DAG: -power9-vector +// CHECKSOFT-DAG: -power8-vector +// CHECKSOFT-DAG: -crypto + +// CHECKNOSOFT-DAG: +vsx +// CHECKNOSOFT-DAG: +altivec +// CHECKNOSOFT-DAG: +direct-move +// CHECKNOSOFT-DAG: +mma +// CHECKNOSOFT-DAG: +paired-vector-memops +// CHECKNOSOFT-DAG: +power10-vector +// CHECKNOSOFT-DAG: +power9-vector +// CHECKNOSOFT-DAG: +power8-vector +// CHECKNOSOFT-DAG: +crypto diff --git a/clang/test/Modules/check-for-sanitizer-feature.cpp b/clang/test/Modules/check-for-sanitizer-feature.cpp index 
2137b1bf36bb845..861b571f0efaa0e 100644 --- a/clang/test/Modules/check-for-sanitizer-feature.cpp +++ b/clang/test/Modules/check-for-sanitizer-feature.cpp @@ -43,7 +43,7 @@ // // Import the PCH without ASan enabled (we expect an error). // RUN: not %clang_cc1 -x c -include-pch %t.asan_pch %s -verify 2>&1 | FileCheck %s --check-prefix=PCH_MISMATCH -// PCH_MISMATCH: AST file was compiled with the target feature '-fsanitize=address' but the current translation unit is not +// PCH_MISMATCH: AST file '{{.*}}.asan_pch' was compiled with the target feature '-fsanitize=address' but the current translation unit is not // // Emit a PCH with UBSan enabled. // RUN: %clang_cc1 -x c -fsanitize=null %S/Inputs/check-for-sanitizer-feature/check.h -emit-pch -o %t.ubsan_pch diff --git a/clang/test/Modules/ignored_macros.m b/clang/test/Modules/ignored_macros.m index a87a11f89c314f5..33801dfa4f47677 100644 --- a/clang/test/Modules/ignored_macros.m +++ b/clang/test/Modules/ignored_macros.m @@ -10,7 +10,7 @@ // RUN: %clang_cc1 -fmodules-cache-path=%t.modules -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch -x objective-c-header %s -verify // RUN: not %clang_cc1 -fmodules-cache-path=%t.modules -DIGNORED=1 -fmodules -fimplicit-module-maps -I %S/Inputs -include-pch %t.pch %s > %t.err 2>&1 // RUN: FileCheck -check-prefix=CHECK-CONFLICT %s < %t.err -// CHECK-CONFLICT: PCH was compiled with module cache path +// CHECK-CONFLICT: AST file '{{.*}}' was compiled with module cache path // Third trial: pass -DIGNORED=1 only to the second invocation, but // make it ignored. 
There should be no failure, IGNORED is defined in diff --git a/clang/test/Modules/load_failure.c b/clang/test/Modules/load_failure.c index 3a8d29597348da9..662b39b6f1874f8 100644 --- a/clang/test/Modules/load_failure.c +++ b/clang/test/Modules/load_failure.c @@ -15,7 +15,7 @@ // RUN: FileCheck -check-prefix=CHECK-FAILURE %s < %t.out // FIXME: Clean up diagnostic text below and give it a location -// CHECK-FAILURE: error: C99 was disabled in PCH file but is currently enabled +// CHECK-FAILURE: error: C99 was disabled in AST file '{{.*}}load_failure.pcm' but is currently enabled // FIXME: When we have a syntax for modules in C, use that. diff --git a/clang/test/Modules/merge-target-features.cpp b/clang/test/Modules/merge-target-features.cpp index 6a29c2db8a8d9e4..cc2bbfa077e9857 100644 --- a/clang/test/Modules/merge-target-features.cpp +++ b/clang/test/Modules/merge-target-features.cpp @@ -20,7 +20,7 @@ // RUN: -target-cpu i386 \ // RUN: -fsyntax-only merge-target-features.cpp 2>&1 \ // RUN: | FileCheck --check-prefix=SUBSET --implicit-check-not=error: %s -// SUBSET: error: AST file was compiled with the target feature '+sse2' but the current translation unit is not +// SUBSET: error: AST file '{{.*}}foo.pcm' was compiled with the target feature '+sse2' but the current translation unit is not // SUBSET: error: {{.*}} configuration mismatch // // RUN: %clang_cc1 -fmodules -x c++ -fmodules-cache-path=%t \ @@ -57,8 +57,8 @@ // RUN: -target-cpu i386 -target-feature +cx16 \ // RUN: -fsyntax-only merge-target-features.cpp 2>&1 \ // RUN: | FileCheck --check-prefix=MISMATCH --implicit-check-not=error: %s -// MISMATCH: error: AST file was compiled with the target feature '+sse2' but the current translation unit is not -// MISMATCH: error: current translation unit is compiled with the target feature '+cx16' but the AST file was not +// MISMATCH: error: AST file '{{.*}}foo.pcm' was compiled with the target feature '+sse2' but the current translation unit is not +// MISMATCH: 
error: current translation unit is compiled with the target feature '+cx16' but the AST file '{{.*}}foo.pcm' was not // MISMATCH: error: {{.*}} configuration mismatch #include "foo.h" diff --git a/clang/test/Modules/mismatch-diagnostics.cpp b/clang/test/Modules/mismatch-diagnostics.cpp index 5a026aa1f6c020f..dffd4b46a678e5b 100644 --- a/clang/test/Modules/mismatch-diagnostics.cpp +++ b/clang/test/Modules/mismatch-diagnostics.cpp @@ -29,5 +29,5 @@ export module mismatching_module; //--- use.cpp import mismatching_module; -// CHECK: error: POSIX thread support was enabled in PCH file but is currently disabled +// CHECK: error: POSIX thread support was enabled in AST file '{{.*[/|\\\\]}}mismatching_module.pcm' but is currently disabled // CHECK-NEXT: module file {{.*[/|\\\\]}}mismatching_module.pcm cannot be loaded due to a configuration mismatch with the current compilation diff --git a/clang/test/Modules/module-pch-different-cache-path.c b/clang/test/Modules/module-pch-different-cache-path.c index 8778adc886f719e..8dd04a166eab629 100644 --- a/clang/test/Modules/module-pch-different-cache-path.c +++ b/clang/test/Modules/module-pch-different-cache-path.c @@ -14,5 +14,5 @@ pch_int x = 0; -// CHECK-ERROR: PCH was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' -// CHECK-SUCCESS-NOT: PCH was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' +// CHECK-ERROR: AST file '{{.*}}' was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' +// CHECK-SUCCESS-NOT: AST file '{{.*}}' was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' diff --git a/clang/test/Modules/pr101398.cppm b/clang/test/Modules/pr101398.cppm new file mode 100644 index 000000000000000..843d0ce84fdce3a --- /dev/null +++ b/clang/test/Modules/pr101398.cppm @@ -0,0 +1,5 @@ +// RUN: mkdir -p %t +// RUN: %clang -std=c++20 -xc++-module %s -Xclang -verify --precompile -o %t/tmp.pcm +// not modules + +// 
expected-error@* {{missing 'export module' declaration in module interface unit}} diff --git a/clang/test/Modules/pr62359.cppm b/clang/test/Modules/pr62359.cppm index 69acc3ce303a574..7d9d3eec26cca7a 100644 --- a/clang/test/Modules/pr62359.cppm +++ b/clang/test/Modules/pr62359.cppm @@ -43,7 +43,7 @@ int use() { return 0; } -// CHECK: OpenMP{{.*}}differs in PCH file vs. current file +// CHECK: OpenMP{{.*}}differs in AST file '{{.*}}Hello.pcm' vs. current file //--- use2.cpp // expected-no-diagnostics @@ -55,5 +55,5 @@ int use2() { return 0; } -// CHECK: OpenMP{{.*}}differs in PCH file vs. current file +// CHECK: OpenMP{{.*}}differs in AST file '{{.*}}Hello.pcm' vs. current file // CHECK: use of undeclared identifier 'pragma' diff --git a/clang/test/Modules/pr99825.cppm b/clang/test/Modules/pr99825.cppm new file mode 100644 index 000000000000000..fe6541c6e68e506 --- /dev/null +++ b/clang/test/Modules/pr99825.cppm @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify +// expected-no-diagnostics +export module mod; + +extern "C++" +{ + export constexpr auto x = 10; +} diff --git a/clang/test/PCH/arc.m b/clang/test/PCH/arc.m index 32069e2314164cd..e4ad71a469b956b 100644 --- a/clang/test/PCH/arc.m +++ b/clang/test/PCH/arc.m @@ -14,5 +14,5 @@ array0 a0; array1 a1; -// CHECK-ERR1: Objective-C automated reference counting was enabled in PCH file but is currently disabled -// CHECK-ERR2: Objective-C automated reference counting was disabled in PCH file but is currently enabled +// CHECK-ERR1: Objective-C automated reference counting was enabled in AST file '{{.*}}' but is currently disabled +// CHECK-ERR2: Objective-C automated reference counting was disabled in AST file '{{.*}}' but is currently enabled diff --git a/clang/test/PCH/fuzzy-pch.c b/clang/test/PCH/fuzzy-pch.c index 7296d1dc893b3b1..53985866dc08ed1 100644 --- a/clang/test/PCH/fuzzy-pch.c +++ b/clang/test/PCH/fuzzy-pch.c @@ -24,8 +24,8 @@ BAR bar = 17; # error BAR was not defined #endif -// 
CHECK-FOO: definition of macro 'FOO' differs between the precompiled header ('1') and the command line ('blah') -// CHECK-NOFOO: macro 'FOO' was defined in the precompiled header but undef'd on the command line +// CHECK-FOO: definition of macro 'FOO' differs between the AST file '{{.*}}' ('1') and the command line ('blah') +// CHECK-NOFOO: macro 'FOO' was defined in the AST file '{{.*}}' but undef'd on the command line -// CHECK-UNDEF: command line contains '-undef' but precompiled header was not built with it +// CHECK-UNDEF: command line contains '-undef' but AST file '{{.*}}' was not built with it diff --git a/clang/test/PCH/module-hash-difference.m b/clang/test/PCH/module-hash-difference.m index fc542b0e8d1ad1e..73cf536f88b4f17 100644 --- a/clang/test/PCH/module-hash-difference.m +++ b/clang/test/PCH/module-hash-difference.m @@ -4,5 +4,5 @@ // RUN: not %clang_cc1 -fsyntax-only -include-pch %t.pch %s -I %S/Inputs/modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t.mcp -fdisable-module-hash 2> %t.err // RUN: FileCheck -input-file=%t.err %s -// CHECK: error: PCH was compiled with module cache path {{.*}}, but the path is currently {{.*}} +// CHECK: error: AST file '{{.*}}' was compiled with module cache path {{.*}}, but the path is currently {{.*}} @import Foo; diff --git a/clang/test/PCH/ms-pch-macro.c b/clang/test/PCH/ms-pch-macro.c index a512e66e2486686..4d4900cc4f90dc0 100644 --- a/clang/test/PCH/ms-pch-macro.c +++ b/clang/test/PCH/ms-pch-macro.c @@ -33,7 +33,7 @@ BAR bar = 17; # error BAR was not defined #endif -// CHECK-FOO: definition of macro 'FOO' differs between the precompiled header ('1') and the command line ('blah') -// CHECK-NOFOO: macro 'FOO' was defined in the precompiled header but undef'd on the command line +// CHECK-FOO: definition of macro 'FOO' differs between the AST file '{{.*}}1.pch' ('1') and the command line ('blah') +// CHECK-NOFOO: macro 'FOO' was defined in the AST file '{{.*}}1.pch' but undef'd on the command line // 
expected-warning@2 {{definition of macro 'BAR' does not match definition in precompiled header}} diff --git a/clang/test/PCH/no-validate-pch.cl b/clang/test/PCH/no-validate-pch.cl index 26c5fd5cc04c2ca..aa228ee2052192c 100644 --- a/clang/test/PCH/no-validate-pch.cl +++ b/clang/test/PCH/no-validate-pch.cl @@ -16,8 +16,8 @@ // CHECK: note: previous definition is here // CHECK: #define X 4 -// CHECK-VAL: error: __OPTIMIZE__ predefined macro was enabled in PCH file but is currently disabled -// CHECK-VAL: error: definition of macro 'X' differs between the precompiled header ('4') and the command line ('5') +// CHECK-VAL: error: __OPTIMIZE__ predefined macro was enabled in AST file '{{.*}}' but is currently disabled +// CHECK-VAL: error: definition of macro 'X' differs between the AST file '{{.*}}' ('4') and the command line ('5') void test(void) { int a = ONE; diff --git a/clang/test/Sema/ZOSExtensions.cpp b/clang/test/Sema/ZOSExtensions.cpp new file mode 100644 index 000000000000000..9b2d3cdb34529cb --- /dev/null +++ b/clang/test/Sema/ZOSExtensions.cpp @@ -0,0 +1,120 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos %s -fsyntax-only -fzos-extensions -verify +// RUN: %clang_cc1 -triple s390x-ibm-zos %s -fsyntax-only -verify + +struct A { + int a; + short b; + float q; + double z; +}; + +union B { + int a; + short b; + float q; + double z; +}; + +class C { + int a; + short b; + float q; + double z; +}; + +// ************************ +// INCORRECT DECLARATION +// ************************ +int * __ptr64 p64; // expected-error {{expected ';' after top level declarator}} +int *wrong_var3 __ptr32; // expected-error {{expected ';' after top level declarator}} expected-warning {{declaration does not declare anything}} + +// ************************** +// INCORRECT USAGES OF PTR32 +// ************************** +struct D { + int __ptr32 *a; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +}; + +union E { + int __ptr32 *b; // expected-error {{'__ptr32' 
attribute only applies to pointer arguments}} +}; + +char __ptr32 *a; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed char __ptr32 *b; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned char __ptr32 *c; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *d; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed int __ptr32 *e; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned int __ptr32 *f; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +short int __ptr32 *g; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed short int __ptr32 *h; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned short int __ptr32 *i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +long int __ptr32 *j; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed long int __ptr32 *k; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned long int __ptr32 *l; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +long long int __ptr32 *m; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed long long int __ptr32 *n; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned long long int __ptr32 *o; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +float __ptr32 *p; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +double __ptr32 *q; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 **r; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *__ptr32 *s; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 
*__ptr32 *__ptr32 t; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *__ptr32 *__ptr32 *__ptr32 u; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 __ptr32 **v_i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 __ptr32 __ptr32 w_i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +__ptr32 int wrong_var; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +struct A __ptr32 *c1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 **e1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 *__ptr32 *f1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 *__ptr32 *__ptr32 g1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 *d1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 **h1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 * __ptr32 *i1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 * __ptr32 * __ptr32 j1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +C __ptr32 **k1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +C __ptr32 * __ptr32 *l1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +C __ptr32 * __ptr32 * __ptr32 m1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +struct D n1; +union E o1; + +int incorrect_func() { + int __ptr32 = 1; // expected-error 
{{expected unqualified-id}} + return __ptr32; // expected-error {{expected expression}} +} + +typedef int __ptr32; // expected-warning {{typedef requires a name}} +int incorrect_func2() { + return 1; +} + +typedef int __ptr32 *v; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int incorrect_func3() { + v v1; + return 0; +} + +int *__ptr32 a_ptr; //expected-note {{previous definition is here}} +int *a_ptr; // expected-error {{redefinition of 'a_ptr' with a different type: 'int *' vs 'int * __ptr32'}} + +// ******************************************************* +// FUNCTION OVERLOADING BETWEEN PTR32 AND REGULAR POINTERS +// ******************************************************* +void func(int * __ptr32 p32) {} // expected-note {{previous definition is here}} +void func(int *p64) {} // expected-error {{redefinition of 'func'}} + +// Overloads between ptr32 and other non-pointer types are permissible +void func1(int *__ptr32 p32) {} +void func1(int p64) {} + +// ****** +// MISC +// ****** +void func2() { + char * __ptr32 v = ((char * __ptr32 *)1028)[0]; + char *v1 = ((char ** __ptr32 *)1028)[0][1]; +} + diff --git a/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c index 7d9c9a90880fff4..5a739f9c6bc0432 100644 --- a/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c +++ b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -fexperimental-bounds-safety -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-bounds-safety -fexperimental-late-parse-attributes -verify %s // // This is a portion of the `attr-counted-by-vla.c` test but is checked // under the semantics of `-fexperimental-bounds-safety` which has different diff --git a/clang/test/Sema/attr-counted-by-or-null-last-field.c b/clang/test/Sema/attr-counted-by-or-null-last-field.c index dd3a6422521c02a..12c0b6de44f7264 100644 --- 
a/clang/test/Sema/attr-counted-by-or-null-last-field.c +++ b/clang/test/Sema/attr-counted-by-or-null-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) @@ -82,7 +83,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __counted_by_or_null(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'counted_by_or_null' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __counted_by_or_null(self); }; struct non_int_count { diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c index 301977300b06a19..4b898e7369c1978 100644 --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c index 017aafe0c9396ac..708bb727ce09dad 100644 --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __counted_by_or_null(f) 
__attribute__((counted_by_or_null(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c index 8abd4476fe5977b..1e8c7179e790365 100644 --- a/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c index 9b0f2eafb13c2b5..1de93640cf45806 100644 --- a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c index cd2bfe36938b2e1..321d6aafbeba24a 100644 --- a/clang/test/Sema/attr-counted-by-struct-ptrs.c +++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c index 
31c0007501c48da..8cc5f6448254846 100644 --- a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple arm64-apple-darwin -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c index 571d6e6291e6bcb..35737c03f3222e4 100644 --- a/clang/test/Sema/attr-counted-by-vla.c +++ b/clang/test/Sema/attr-counted-by-vla.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify=expected,late #define __counted_by(f) __attribute__((counted_by(f))) @@ -80,7 +81,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'counted_by' requires a non-boolean integer type argument}} + struct bar *self[] __counted_by(self); }; struct non_int_count { diff --git a/clang/test/Sema/attr-print-zos.c b/clang/test/Sema/attr-print-zos.c new file mode 100644 index 000000000000000..f19926c131a4f39 --- /dev/null +++ b/clang/test/Sema/attr-print-zos.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 %s -triple s390x-ibm-zos -ast-print -fzos-extensions | FileCheck %s + +// CHECK: int * __ptr32 p32; +int * __ptr32 p32; + +// CHECK: char * __ptr32 c32; +char * __ptr32 c32; + +// CHECK: void * __ptr32 v32; +void * __ptr32 v32; + +// CHECK: int * __ptr32 *q; +int * __ptr32 *q; + +// CHECK: void *func(int * __ptr32 p); +void *func(int * __ptr32 p); + +// CHECK: int * __ptr32 func1(int * 
__ptr32 p); +int * __ptr32 func1(int * __ptr32 p); + +// CHECK: int *func2(void * __ptr32 p); +int *func2(void * __ptr32 p); + +// CHECK: int *const __ptr32 r; +int * __ptr32 const r; + +// CHECK: int ** __ptr32 *v; +int * *__ptr32* v; + +// CHECK: int *** __ptr32 *z; +int ** * __ptr32 * z; diff --git a/clang/test/Sema/attr-sized-by-last-field.c b/clang/test/Sema/attr-sized-by-last-field.c index 6af29e9f31435da..f2e74f7fdf4b510 100644 --- a/clang/test/Sema/attr-sized-by-last-field.c +++ b/clang/test/Sema/attr-sized-by-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify=expected,late %s #define __sized_by(f) __attribute__((sized_by(f))) @@ -82,7 +83,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __sized_by(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'sized_by' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __sized_by(self); }; struct non_int_size { diff --git a/clang/test/Sema/attr-sized-by-or-null-last-field.c b/clang/test/Sema/attr-sized-by-or-null-last-field.c index 96bbe847b910bf5..08ef5263a05cf03 100644 --- a/clang/test/Sema/attr-sized-by-or-null-last-field.c +++ b/clang/test/Sema/attr-sized-by-or-null-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) @@ -82,7 +83,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __sized_by_or_null(self); // expected-error {{use of undeclared identifier 'self'}} + // 
immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'sized_by_or_null' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __sized_by_or_null(self); }; struct non_int_size { diff --git a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c index 4a360b9722a0b0c..d960c0d31b65cc2 100644 --- a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c index 2c7578b5ecbe64c..4200c9275a18040 100644 --- a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c +++ b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c b/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c index 398b1df592fe389..7d16c2d456a02c9 100644 --- a/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple 
arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c index 2e916bdb04720ce..7038330e60eee5b 100644 --- a/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) diff --git a/clang/test/Sema/attr-sized-by-struct-ptrs.c b/clang/test/Sema/attr-sized-by-struct-ptrs.c index 01195469c6fe42f..07373b247d0f790 100644 --- a/clang/test/Sema/attr-sized-by-struct-ptrs.c +++ b/clang/test/Sema/attr-sized-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-sized-by-vla-sizeless-types.c b/clang/test/Sema/attr-sized-by-vla-sizeless-types.c index 37e91639bb4a1f1..8a94b1217c9064a 100644 --- a/clang/test/Sema/attr-sized-by-vla-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp index 
9ef5303a9c4df55..9aaa13d7ac41ad7 100644 --- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp +++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp @@ -19,13 +19,16 @@ template constexpr bool has_type(T&) { return true; } std::initializer_list il1 = {1, 2, 3, 4, 5}; auto il2 = std::initializer_list{1, 2, 3, 4}; -auto il3 = std::initializer_list{il1}; +auto il3 = std::initializer_list(il1); auto il4 = std::initializer_list{il1, il1, il1}; static_assert(has_type>(il1)); static_assert(has_type>(il2)); static_assert(has_type>(il3)); static_assert(has_type>>(il4)); +auto il5 = std::initializer_list{il1}; +// expected-error@-1 {{no viable conversion from 'std::initializer_list' to 'const int'}} + template struct vector { template vector(Iter, Iter); vector(std::initializer_list); diff --git a/clang/test/SemaCXX/single-element-init-list.cpp b/clang/test/SemaCXX/single-element-init-list.cpp new file mode 100644 index 000000000000000..33d986e08401383 --- /dev/null +++ b/clang/test/SemaCXX/single-element-init-list.cpp @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify %s + +// This is heavily affected by the speculative resolution applied to CWG2311 +// So behaviour shown here is subject to change. 
+ +// expected-no-diagnostics + +namespace std { + typedef decltype(sizeof(int)) size_t; + + // libc++'s implementation + template + class initializer_list + { + const _E* __begin_; + size_t __size_; + + initializer_list(const _E* __b, size_t __s) + : __begin_(__b), + __size_(__s) + {} + + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + + typedef const _E* iterator; + typedef const _E* const_iterator; + + constexpr initializer_list() : __begin_(nullptr), __size_(0) {} + + constexpr size_t size() const {return __size_;} + const _E* begin() const {return __begin_;} + const _E* end() const {return __begin_ + __size_;} + }; + + template + struct vector { + size_t sz; + constexpr vector() : sz(0) {} + constexpr vector(initializer_list ilist) : sz(ilist.size()) {} + constexpr vector(const vector& other) : sz(other.sz) {} + constexpr std::size_t size() const { return sz; } + }; +} + +// https://github.com/llvm/llvm-project/pull/77768#issuecomment-1908062472 +namespace Issue1 { + struct A { + constexpr A() {} + }; + + struct B { + int called_ctor; + constexpr explicit B(A) : called_ctor(0) {} + constexpr explicit B(std::vector) : called_ctor(1) {} + }; + + struct C { + B b; + constexpr C() : b({A()}) {} + }; + + static_assert(C().b.called_ctor == 0); +} + +// https://github.com/llvm/llvm-project/pull/77768#issuecomment-1957171805 +namespace Issue2 { + struct A { + constexpr A(int x_) {} + constexpr A(const std::vector& a) {} + }; + + void f() { + constexpr std::vector a{1,2}; + constexpr std::vector b{a}; + // -> constexpr std::vector b(std::initializer_list{ A(a) }); + static_assert(b.size() == 1); + } +} diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 8ac5a0e753a3409..9330df8cdd0398a 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -12,7 
+12,7 @@ void g() { // expected-note@#FDEF{{because 'int' does not satisfy 'c'}} // expected-note@#CDEF{{because 'f(t)' would be invalid: no matching function for call to 'f'}} } -} // namespace GH53213 +} // namespace GH53213 namespace GH45736 { struct constrained; @@ -69,13 +69,13 @@ void baz() { auto it = begin(rng); // #BEGIN_CALL // expected-error@#INF_BEGIN {{satisfaction of constraint 'Inf' depends on itself}} // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} -// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin' required here}} +// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin' required here}} // expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}} // expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}} // expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}} -// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf' requested here}} +// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf' requested here}} // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} -// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin' required here}} +// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin' required here}} // expected-note@#BEGIN_CALL {{in instantiation of function template specialization}} // Fallout of the failure is failed lookup, which is necessary to stop odd @@ -100,7 +100,7 @@ namespace GH50891 { static_assert(Numeric); // #STATIC_ASSERT // expected-error@#NUMERIC{{satisfaction of constraint 'requires (T a) { foo(a); }' depends on itself}} // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} - // 
expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric' requested here}} + // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric' requested here}} // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}} // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}} // expected-note@#FOO_CALL {{in instantiation of function template specialization}} @@ -108,7 +108,7 @@ namespace GH50891 { // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} // expected-error@#STATIC_ASSERT {{static assertion failed}} - // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric' requested here}} + // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric' requested here}} // expected-note@#STATIC_ASSERT{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } // namespace GH50891 diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 19b138270eebb28..a4b42cad79abd49 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -836,13 +836,13 @@ struct Parent { static_assert(Parent::TakesUnary::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesUnary'}} // expected-note@#UNARY{{because 'decltype(0ULL)' (aka 'unsigned long long') does not satisfy 'C'}} -// expected-note@#61777_C{{because 'sizeof(decltype(0ULL)) == 4' (8 == 4) evaluated to false}} +// expected-note@#61777_C{{because 'sizeof(unsigned long long) == 4' (8 == 4) evaluated to false}} static_assert(Parent::TakesUnary::i == 0); static_assert(Parent::TakesBinary::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesBinary'}} // expected-note@#BINARY{{because 'C2' evaluated to false}} -// expected-note@#61777_C2{{because 
'sizeof(decltype(0ULL)) == sizeof(int)' (8 == 4) evaluated to false}} +// expected-note@#61777_C2{{because 'sizeof(unsigned long long) == sizeof(int)' (8 == 4) evaluated to false}} static_assert(Parent::TakesBinary::i == 0); } diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index c5ebeff828295a8..20a19d731ae1696 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -76,8 +76,8 @@ namespace type_requirement { // expected-note@-2 {{because 'false_v::template temp >; }>' evaluated to false}} struct r2 {}; - using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template]}} - using r2i2 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template]}} + using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} + using r2i2 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} // substitution error occurs, then requires expr is instantiated again @@ -108,7 +108,7 @@ namespace type_requirement { // expected-note@-1 {{because 'false_v>; } && requires { <>; }>' evaluated to false}} struct r7 {}; - using r7i = r7; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = ]}} + using r7i = r7; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = ]}} } namespace expr_requirement { @@ -237,13 +237,3 @@ constexpr bool e_v = true; static_assert(e_v); } // namespace GH73885 - -namespace sugared_instantiation { - template concept C = requires { C1{}; }; - template concept D = requires { new D1; }; - - // Test that 'deduced auto' doesn't get confused with 'undeduced auto'. 
- auto f() { return 0; } - static_assert(requires { { f() } -> C; }); - static_assert(requires { { f() } -> D; }); -} // namespace sugared_instantiation diff --git a/clang/test/SemaTemplate/pr52970.cpp b/clang/test/SemaTemplate/pr52970.cpp index 6aabc419bd2b88c..7aac5ee85659349 100644 --- a/clang/test/SemaTemplate/pr52970.cpp +++ b/clang/test/SemaTemplate/pr52970.cpp @@ -53,7 +53,7 @@ static_assert(!DotFollowingPointer::f(Bad{}), ""); #if __cplusplus >= 202002L template concept C = requires(T t) { t.begin(); }; - // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Bad' (aka 'Holder *') is a pointer}} + // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Holder *' is a pointer}} static_assert(C); static_assert(!C); diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt index 4a16c3ca9f0903c..bf37d8031025ed3 100644 --- a/clang/tools/clang-linker-wrapper/CMakeLists.txt +++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt @@ -31,7 +31,6 @@ add_clang_tool(clang-linker-wrapper DEPENDS ${tablegen_deps} - EXPORT_SYMBOLS_FOR_PLUGINS ) set(CLANG_LINKER_WRAPPER_LIB_DEPS @@ -42,3 +41,5 @@ target_link_libraries(clang-linker-wrapper PRIVATE ${CLANG_LINKER_WRAPPER_LIB_DEPS} ) + +export_executable_symbols_for_plugins(clang-linker-wrapper) diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index 52b740b356284e4..a35ff13494e1153 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ b/clang/tools/clang-repl/CMakeLists.txt @@ -9,8 +9,6 @@ set( LLVM_LINK_COMPONENTS add_clang_tool(clang-repl ClangRepl.cpp - - EXPORT_SYMBOLS_FOR_PLUGINS ) if(MSVC) @@ -63,6 +61,8 @@ clang_target_link_libraries(clang-repl PRIVATE clangInterpreter ) +export_executable_symbols_for_plugins(clang-repl) + # The clang-repl binary can get huge with static linking in debug mode. 
# Some 32-bit targets use PLT slots with limited branch range by default and we # start to exceed this limit, e.g. when linking for arm-linux-gnueabihf with diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index 805dffb0d9b7064..018605c2fd4f268 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -21,7 +21,6 @@ set( LLVM_LINK_COMPONENTS # Support plugins. if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) endif() add_clang_tool(clang @@ -36,7 +35,6 @@ add_clang_tool(clang ARMTargetParserTableGen AArch64TargetParserTableGen ${support_plugins} - ${export_symbols} GENERATE_DRIVER ) @@ -56,6 +54,11 @@ else() set_target_properties(clang PROPERTIES VERSION ${CLANG_EXECUTABLE_VERSION}) endif() +# Support plugins. +if(CLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(clang) +endif() + add_dependencies(clang clang-resource-headers) if(NOT CLANG_LINKS_TO_CREATE) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b8b733dbfa7163b..5ad60002733a04d 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12637,7 +12637,7 @@

C++ defect report implementation status

2137 CD4 List-initialization from object of same type - Unknown + Clang 20 2138 diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 34b16df75841b56..27f8697db7838f5 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -178,7 +178,11 @@ template class NonZeroLengthArray { T &operator[](uptr UNUSED Idx) { UNREACHABLE("Unsupported!"); } }; -template class MapAllocatorCache { +// The default unmap callback is simply scudo::unmap. +// In testing, a different unmap callback is used to +// record information about unmaps in the cache +template +class MapAllocatorCache { public: void getStats(ScopedString *Str) { ScopedLock L(Mutex); @@ -246,6 +250,7 @@ template class MapAllocatorCache { const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); u64 Time; CachedBlock Entry; + Entry.CommitBase = CommitBase; Entry.CommitSize = CommitSize; Entry.BlockBegin = BlockBegin; @@ -290,7 +295,7 @@ template class MapAllocatorCache { // read Options and when we locked Mutex. We can't insert our entry into // the quarantine or the cache because the permissions would be wrong so // just unmap it. 
- unmap(Entry.MemMap); + unmapCallBack(Entry.MemMap); break; } if (Config::getQuarantineSize() && useMemoryTagging(Options)) { @@ -321,7 +326,7 @@ template class MapAllocatorCache { } while (0); for (MemMapT &EvictMemMap : EvictionMemMaps) - unmap(EvictMemMap); + unmapCallBack(EvictMemMap); if (Interval >= 0) { // TODO: Add ReleaseToOS logic to LRU algorithm @@ -423,7 +428,7 @@ template class MapAllocatorCache { for (u32 I = 0; I != Config::getQuarantineSize(); ++I) { if (Quarantine[I].isValid()) { MemMapT &MemMap = Quarantine[I].MemMap; - unmap(MemMap); + unmapCallBack(MemMap); Quarantine[I].invalidate(); } } @@ -517,7 +522,7 @@ template class MapAllocatorCache { } for (uptr I = 0; I < N; I++) { MemMapT &MemMap = MapInfo[I]; - unmap(MemMap); + unmapCallBack(MemMap); } } diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 5685a9335316d1e..e85b6abdb36d228 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -265,3 +265,101 @@ TEST_F(MapAllocatorWithReleaseTest, SecondaryThreadsRace) { Allocator->getStats(&Str); Str.output(); } + +struct MapAllocatorCacheTest : public Test { + static constexpr scudo::u32 UnmappedMarker = 0xDEADBEEF; + + static void testUnmapCallback(scudo::MemMapT &MemMap) { + scudo::u32 *Ptr = reinterpret_cast(MemMap.getBase()); + *Ptr = UnmappedMarker; + } + + using SecondaryConfig = scudo::SecondaryConfig; + using CacheConfig = SecondaryConfig::CacheConfig; + using CacheT = scudo::MapAllocatorCache; + + std::unique_ptr Cache = std::make_unique(); + + const scudo::uptr PageSize = scudo::getPageSizeCached(); + // The current test allocation size is set to the minimum size + // needed for the scudo allocator to fall back to the secondary allocator + static constexpr scudo::uptr TestAllocSize = + CacheConfig::getDefaultMaxEntrySize(); + + scudo::Options Options = 
getOptionsForConfig(); + + void SetUp() override { Cache->init(/*ReleaseToOsInterval=*/-1); } + + void TearDown() override { Cache->unmapTestOnly(); } + + scudo::MemMapT allocate(scudo::uptr Size) { + scudo::uptr MapSize = scudo::roundUp(Size, PageSize); + scudo::ReservedMemoryT ReservedMemory; + CHECK(ReservedMemory.create(0U, MapSize, nullptr, MAP_ALLOWNOMEM)); + + scudo::MemMapT MemMap = ReservedMemory.dispatch( + ReservedMemory.getBase(), ReservedMemory.getCapacity()); + MemMap.remap(MemMap.getBase(), MemMap.getCapacity(), "scudo:test", + MAP_RESIZABLE | MAP_ALLOWNOMEM); + return MemMap; + } + + void fillCacheWithSameSizeBlocks(std::vector &MemMaps, + scudo::uptr NumEntries, scudo::uptr Size) { + for (scudo::uptr I = 0; I < NumEntries; I++) { + MemMaps.emplace_back(allocate(Size)); + auto &MemMap = MemMaps[I]; + Cache->store(Options, MemMap.getBase(), MemMap.getCapacity(), + MemMap.getBase(), MemMap); + } + } +}; + +TEST_F(MapAllocatorCacheTest, CacheOrder) { + std::vector MemMaps; + Cache->setOption(scudo::Option::MaxCacheEntriesCount, + CacheConfig::getEntriesArraySize()); + + fillCacheWithSameSizeBlocks(MemMaps, CacheConfig::getEntriesArraySize(), + TestAllocSize); + + // Retrieval order should be the inverse of insertion order + for (scudo::uptr I = CacheConfig::getEntriesArraySize(); I > 0; I--) { + scudo::uptr EntryHeaderPos; + scudo::CachedBlock Entry = + Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos); + EXPECT_EQ(Entry.MemMap.getBase(), MemMaps[I - 1].getBase()); + } + + // Clean up MemMaps + for (auto &MemMap : MemMaps) + MemMap.unmap(); +} + +TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { + std::vector MemMaps; + // Fill the cache above MaxEntriesCount to force an eviction + // The first cache entry should be evicted (because it is the oldest) + // due to the maximum number of entries being reached + fillCacheWithSameSizeBlocks( + MemMaps, CacheConfig::getDefaultMaxEntriesCount() + 1, TestAllocSize); + + std::vector RetrievedEntries; + + 
// First MemMap should be evicted from cache because it was the first + // inserted into the cache + for (scudo::uptr I = CacheConfig::getDefaultMaxEntriesCount(); I > 0; I--) { + scudo::uptr EntryHeaderPos; + RetrievedEntries.push_back( + Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos)); + EXPECT_EQ(MemMaps[I].getBase(), RetrievedEntries.back().MemMap.getBase()); + } + + // Evicted entry should be marked due to unmap callback + EXPECT_EQ(*reinterpret_cast(MemMaps[0].getBase()), + UnmappedMarker); + + // Clean up MemMaps + for (auto &MemMap : MemMaps) + MemMap.unmap(); +} diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index e3efa0a1a4fd278..06a632e6708b1e6 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -10,9 +10,8 @@ #include #include -extern "C" { #if defined(EXE) -__declspec(dllimport) int foo_from_dll(); +extern "C" __declspec(dllimport) int foo_from_dll(); // CHECK: in DLL(reason=1) int main(int argc, char **argv) { @@ -23,6 +22,7 @@ int main(int argc, char **argv) { // CHECK: in DLL(reason=0) } #elif defined(DLL) +extern "C" { // This global is registered at startup. int x[42]; @@ -35,7 +35,7 @@ BOOL WINAPI DllMain(HMODULE, DWORD reason, LPVOID) { fflush(0); return TRUE; } +} #else # error oops! #endif -} diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 093596c9dc8ebca..fb57744c2157033 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -384,6 +384,8 @@ end * `BIND(C, NAME="...", CDEFINED)` signifies that the storage for an interoperable variable will be allocated outside of Fortran, probably by a C or C++ external definition. +* An automatic data object may be declared in the specification part + of the main program. 
### Extensions supported when enabled by options diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 938da08e19d6b17..6ef5f44c89db07e 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, @@ -70,7 +70,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, IgnoredIntrinsicFunctionType, PreviousScalarUse, RedeclaredInaccessibleComponent, ImplicitShared, IndexVarRedefinition, IncompatibleImplicitInterfaces, BadTypeForTarget, - VectorSubscriptFinalization, UndefinedFunctionResult) + VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg) using LanguageFeatures = EnumSet; using UsageWarnings = EnumSet; @@ -145,6 +145,7 @@ class LanguageFeatureControl { warnUsage_.set(UsageWarning::BadTypeForTarget); warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); + warnUsage_.set(UsageWarning::UselessIomsg); } LanguageFeatureControl(const LanguageFeatureControl &) = default; diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 8c6d3b37166a92a..de4d415eda6fd61 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1243,6 +1243,18 @@ bool CheckForCoindexedObject(parser::ContextualMessages &, const std::optional &, const std::string &procName, const std::string &argName); +inline bool CanCUDASymbolHasSave(const Symbol 
&sym) { + if (const auto *details = + sym.GetUltimate().detailsIf()) { + if (details->cudaDataAttr() && + *details->cudaDataAttr() != common::CUDADataAttr::Pinned && + *details->cudaDataAttr() != common::CUDADataAttr::Unified) { + return false; + } + } + return true; +} + inline bool IsCUDADeviceSymbol(const Symbol &sym) { if (const auto *details = sym.GetUltimate().detailsIf()) { diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h index 16c2b319ff1de6d..bd8887dbce4e82b 100644 --- a/flang/include/flang/Evaluate/type.h +++ b/flang/include/flang/Evaluate/type.h @@ -494,7 +494,9 @@ bool IsCUDAIntrinsicType(const DynamicType &); // Determine whether two derived type specs are sufficiently identical // to be considered the "same" type even if declared separately. bool AreSameDerivedType( - const semantics::DerivedTypeSpec &x, const semantics::DerivedTypeSpec &y); + const semantics::DerivedTypeSpec &, const semantics::DerivedTypeSpec &); +bool AreSameDerivedTypeIgnoringTypeParameters( + const semantics::DerivedTypeSpec &, const semantics::DerivedTypeSpec &); // For generating "[extern] template class", &c. 
boilerplate #define EXPAND_FOR_EACH_INTEGER_KIND(M, P, S) \ diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 8f5204769d7aa72..f0bfc1548e64587 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -10,6 +10,7 @@ #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_ #include "flang/Runtime/descriptor.h" +#include "flang/Runtime/entry-names.h" #define CUDA_REPORT_IF_ERROR(expr) \ [](CUresult result) { \ @@ -25,7 +26,10 @@ namespace Fortran::runtime::cuda { -void CUFRegisterAllocator(); +extern "C" { + +void RTDECL(CUFRegisterAllocator)(); +} void *CUFAllocPinned(std::size_t); void CUFFreePinned(void *); diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index ec275f349e81bfd..15c02ecc0058cc0 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -442,6 +442,18 @@ std::list> GetStorageAssociations(const Scope &); // closure of its components (including POINTERs) and the // PotentialAndPointer subobject components of its non-POINTER derived type // components. +// +// type t1 ultimate components: x, a, p +// real x direct components: x, a, p +// real, allocatable :: a potential components: x, a +// real, pointer :: p potential & pointers: x, a, p +// end type +// type t2 ultimate components: y, c%x, c%a, c%p, b +// real y direct components: y, c, c%x, c%a, c%p, b +// type(t1) :: c potential components: y, c, c%x, c%a, b, b%x, b%a +// type(t1), allocatable :: b potential & pointers: potentials + c%p + b%p +// end type +// // Parent and procedure components are considered against these definitions. 
// For this kind of iterator, the component tree is recursively visited in the // following order: @@ -620,8 +632,8 @@ UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent( const DerivedTypeSpec &); DirectComponentIterator::const_iterator FindAllocatableOrPointerDirectComponent( const DerivedTypeSpec &); -UltimateComponentIterator::const_iterator -FindPolymorphicAllocatableUltimateComponent(const DerivedTypeSpec &); +PotentialComponentIterator::const_iterator +FindPolymorphicAllocatablePotentialComponent(const DerivedTypeSpec &); // The LabelEnforce class (given a set of labels) provides an error message if // there is a branch to a label which is not in the given set. diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 342aac4dd5d53e7..fef4620857a08ab 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -493,7 +493,7 @@ std::optional> NonPointerInitializationExpr(const Symbol &symbol, } else { context.messages().Say( "Initialization expression for '%s' (%s) cannot be computed as a constant value"_err_en_US, - symbol.name(), folded.AsFortran()); + symbol.name(), x.AsFortran()); } } else if (xType) { context.messages().Say( diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 34faba39ffd46f2..4d78f814f8ef2c9 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1696,7 +1696,8 @@ bool IsSaved(const Symbol &original) { (features.IsEnabled(common::LanguageFeature::SaveMainProgram) || (features.IsEnabled( common::LanguageFeature::SaveBigMainProgramVariables) && - symbol.size() > 32))) { + symbol.size() > 32)) && + Fortran::evaluate::CanCUDASymbolHasSave(symbol)) { // With SaveBigMainProgramVariables, keeping all unsaved main program // variables of 32 bytes or less on the stack allows keeping numerical and // logical scalars, small scalar characters or derived, small arrays, and diff --git 
a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index 463ac01da0e295e..5ecc3701b4f2460 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -505,7 +505,13 @@ bool AreSameDerivedType( return AreSameDerivedType(x, y, false, false, inProgress); } -bool AreSameDerivedType( +bool AreSameDerivedTypeIgnoringTypeParameters( + const semantics::DerivedTypeSpec &x, const semantics::DerivedTypeSpec &y) { + SetOfDerivedTypePairs inProgress; + return AreSameDerivedType(x, y, true, true, inProgress); +} + +static bool AreSameDerivedType( const semantics::DerivedTypeSpec *x, const semantics::DerivedTypeSpec *y) { return x == y || (x && y && AreSameDerivedType(*x, *y)); } diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 70b503799421616..f059d36315a345a 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -234,9 +234,20 @@ class CufOpConversion : public fir::impl::CufOpConversionBase { fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); - - target.addIllegalOp(); + target.addDynamicallyLegalOp([](::cuf::AllocOp op) { + return !mlir::isa(op.getInType()); + }); + target.addDynamicallyLegalOp([](::cuf::FreeOp op) { + if (auto refTy = mlir::dyn_cast_or_null( + op.getDevptr().getType())) { + return !mlir::isa(refTy.getEleTy()); + } + return true; + }); + target.addDynamicallyLegalOp( + [](::cuf::AllocateOp op) { return isBoxGlobal(op); }); + target.addDynamicallyLegalOp( + [](::cuf::DeallocateOp op) { return isBoxGlobal(op); }); patterns.insert(ctx, &*dl, &typeConverter); patterns.insert(ctx); diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index 8f7a200d23239b7..a5363a6710d3196 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ 
b/flang/lib/Semantics/check-allocate.cpp @@ -270,11 +270,13 @@ static bool IsTypeCompatible( const DeclTypeSpec &type1, const DerivedTypeSpec &derivedType2) { if (const DerivedTypeSpec * derivedType1{type1.AsDerived()}) { if (type1.category() == DeclTypeSpec::Category::TypeDerived) { - return &derivedType1->typeSymbol() == &derivedType2.typeSymbol(); + return evaluate::AreSameDerivedTypeIgnoringTypeParameters( + *derivedType1, derivedType2); } else if (type1.category() == DeclTypeSpec::Category::ClassDerived) { for (const DerivedTypeSpec *parent{&derivedType2}; parent; parent = parent->typeSymbol().GetParentTypeSpec()) { - if (&derivedType1->typeSymbol() == &parent->typeSymbol()) { + if (evaluate::AreSameDerivedTypeIgnoringTypeParameters( + *derivedType1, *parent)) { return true; } } diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index a52f013a70b9d9e..de3fa8794caedf7 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -247,6 +247,14 @@ void CheckHelper::Check( } } +static bool IsBlockData(const Scope &scope) { + return scope.kind() == Scope::Kind::BlockData; +} + +static bool IsBlockData(const Symbol &symbol) { + return symbol.scope() && IsBlockData(*symbol.scope()); +} + void CheckHelper::Check(const Symbol &symbol) { if (symbol.name().size() > common::maxNameLen && &symbol == &symbol.GetUltimate()) { @@ -397,9 +405,10 @@ void CheckHelper::Check(const Symbol &symbol) { messages_.Say( "Result of pure function may not have an impure FINAL subroutine"_err_en_US); } - if (auto bad{FindPolymorphicAllocatableUltimateComponent(*derived)}) { + if (auto bad{ + FindPolymorphicAllocatablePotentialComponent(*derived)}) { SayWithDeclaration(*bad, - "Result of pure function may not have polymorphic ALLOCATABLE ultimate component '%s'"_err_en_US, + "Result of pure function may not have polymorphic ALLOCATABLE potential component '%s'"_err_en_US, 
bad.BuildResultDesignatorName()); } } @@ -463,6 +472,23 @@ void CheckHelper::Check(const Symbol &symbol) { messages_.Say( "Automatic data object '%s' may not appear in a module"_err_en_US, symbol.name()); + } else if (IsBlockData(symbol.owner())) { + messages_.Say( + "Automatic data object '%s' may not appear in a BLOCK DATA subprogram"_err_en_US, + symbol.name()); + } else if (symbol.owner().kind() == Scope::Kind::MainProgram) { + if (context_.IsEnabled(common::LanguageFeature::AutomaticInMainProgram)) { + if (context_.ShouldWarn( + common::LanguageFeature::AutomaticInMainProgram)) { + messages_.Say( + "Automatic data object '%s' should not appear in the specification part of a main program"_port_en_US, + symbol.name()); + } + } else { + messages_.Say( + "Automatic data object '%s' may not appear in the specification part of a main program"_err_en_US, + symbol.name()); + } } } if (IsProcedure(symbol)) { @@ -2799,10 +2825,6 @@ static bool IsSubprogramDefinition(const Symbol &symbol) { symbol.scope()->kind() == Scope::Kind::Subprogram; } -static bool IsBlockData(const Symbol &symbol) { - return symbol.scope() && symbol.scope()->kind() == Scope::Kind::BlockData; -} - static bool IsExternalProcedureDefinition(const Symbol &symbol) { return IsBlockData(symbol) || (IsSubprogramDefinition(symbol) && diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index 34225cd40619285..d798244ff1ef2d3 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -269,8 +269,7 @@ class DoConcurrentBodyEnforce { const parser::CharBlock statementLocation{ GetImageControlStmtLocation(construct)}; auto &msg{context_.Say(statementLocation, - "An image control statement is not allowed in DO" - " CONCURRENT"_err_en_US)}; + "An image control statement is not allowed in DO CONCURRENT"_err_en_US)}; if (auto coarrayMsg{GetImageControlStmtCoarrayMsg(construct)}) { msg.Attach(statementLocation, *coarrayMsg); } @@ 
-286,19 +285,32 @@ class DoConcurrentBodyEnforce { .Attach(doConcurrentSourcePosition_, GetEnclosingDoMsg()); } - // C1139: call to impure procedure and ... - // C1141: cannot call ieee_get_flag, ieee_[gs]et_halting_mode - // It's not necessary to check the ieee_get* procedures because they're - // not pure, and impure procedures are caught by checks for constraint C1139 + // C1145, C1146: cannot call ieee_[gs]et_flag, ieee_[gs]et_halting_mode, + // ieee_[gs]et_status, ieee_set_rounding_mode, or ieee_set_underflow_mode void Post(const parser::ProcedureDesignator &procedureDesignator) { if (auto *name{std::get_if(&procedureDesignator.u)}) { - if (name->symbol && - fromScope(*name->symbol, "__fortran_ieee_exceptions"s)) { - if (name->source == "ieee_set_halting_mode") { - SayWithDo(context_, currentStatementSourcePosition_, - "IEEE_SET_HALTING_MODE is not allowed in DO " - "CONCURRENT"_err_en_US, - doConcurrentSourcePosition_); + if (name->symbol) { + const Symbol &ultimate{name->symbol->GetUltimate()}; + const Scope &scope{ultimate.owner()}; + if (const Symbol * module{scope.IsModule() ? 
scope.symbol() : nullptr}; + module && + (module->name() == "__fortran_ieee_arithmetic" || + module->name() == "__fortran_ieee_exceptions")) { + std::string s{ultimate.name().ToString()}; + static constexpr const char *badName[]{"ieee_get_flag", + "ieee_set_flag", "ieee_get_halting_mode", "ieee_set_halting_mode", + "ieee_get_status", "ieee_set_status", "ieee_set_rounding_mode", + "ieee_set_underflow_mode", nullptr}; + for (std::size_t j{0}; badName[j]; ++j) { + if (s.find(badName[j]) != s.npos) { + context_ + .Say(name->source, + "'%s' may not be called in DO CONCURRENT"_err_en_US, + badName[j]) + .Attach(doConcurrentSourcePosition_, GetEnclosingDoMsg()); + break; + } + } } } } @@ -319,15 +331,6 @@ class DoConcurrentBodyEnforce { } private: - bool fromScope(const Symbol &symbol, const std::string &moduleName) { - if (symbol.GetUltimate().owner().IsModule() && - symbol.GetUltimate().owner().GetName().value().ToString() == - moduleName) { - return true; - } - return false; - } - std::set labels_; parser::CharBlock currentStatementSourcePosition_; SemanticsContext &context_; @@ -372,8 +375,8 @@ class DoConcurrentVariableEnforce { // Find a DO or FORALL and enforce semantics checks on its body class DoContext { public: - DoContext(SemanticsContext &context, IndexVarKind kind) - : context_{context}, kind_{kind} {} + DoContext(SemanticsContext &context, IndexVarKind kind, bool isNested) + : context_{context}, kind_{kind}, isNested_{isNested} {} // Mark this DO construct as a point of definition for the DO variables // or index-names it contains. 
If they're already defined, emit an error @@ -743,13 +746,21 @@ class DoContext { std::get>(header.t)}) { CheckMaskIsPure(*mask); } - auto &controls{std::get>(header.t)}; + const auto &controls{ + std::get>(header.t)}; UnorderedSymbolSet indexNames; for (const parser::ConcurrentControl &control : controls) { const auto &indexName{std::get(control.t)}; if (indexName.symbol) { indexNames.insert(*indexName.symbol); } + if (isNested_) { + CheckForImpureCall(std::get<1>(control.t)); + CheckForImpureCall(std::get<2>(control.t)); + if (const auto &stride{std::get<3>(control.t)}) { + CheckForImpureCall(*stride); + } + } } if (!indexNames.empty()) { for (const parser::ConcurrentControl &control : controls) { @@ -808,13 +819,24 @@ class DoContext { CheckConcurrentHeader(std::get(concurrent.t)); } - template void CheckForImpureCall(const T &x) { + template void CheckForImpureCall(const T &x) const { if (auto bad{FindImpureCall(context_.foldingContext(), x)}) { context_.Say( "Impure procedure '%s' may not be referenced in a %s"_err_en_US, *bad, LoopKindName()); } } + void CheckForImpureCall(const parser::ScalarIntExpr &x) const { + const auto &parsedExpr{x.thing.thing.value()}; + auto oldLocation{context_.location()}; + context_.set_location(parsedExpr.source); + if (const auto &typedExpr{parsedExpr.typedExpr}) { + if (const auto &expr{typedExpr->v}) { + CheckForImpureCall(*expr); + } + } + context_.set_location(oldLocation); + } // Each index should be used on the LHS of each assignment in a FORALL void CheckForallIndexesUsed(const evaluate::Assignment &assignment) { @@ -870,40 +892,47 @@ class DoContext { SemanticsContext &context_; const IndexVarKind kind_; parser::CharBlock currentStatementSourcePosition_; + bool isNested_{false}; }; // class DoContext void DoForallChecker::Enter(const parser::DoConstruct &doConstruct) { - DoContext doContext{context_, IndexVarKind::DO}; + DoContext doContext{context_, IndexVarKind::DO, constructNesting_ > 0}; 
doContext.DefineDoVariables(doConstruct); } void DoForallChecker::Leave(const parser::DoConstruct &doConstruct) { - DoContext doContext{context_, IndexVarKind::DO}; + DoContext doContext{context_, IndexVarKind::DO, constructNesting_ > 0}; + ++constructNesting_; doContext.Check(doConstruct); doContext.ResetDoVariables(doConstruct); + --constructNesting_; } void DoForallChecker::Enter(const parser::ForallConstruct &construct) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.ActivateIndexVars(GetControls(construct)); + ++constructNesting_; + doContext.Check(construct); } void DoForallChecker::Leave(const parser::ForallConstruct &construct) { - DoContext doContext{context_, IndexVarKind::FORALL}; - doContext.Check(construct); + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.DeactivateIndexVars(GetControls(construct)); + --constructNesting_; } void DoForallChecker::Enter(const parser::ForallStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; + ++constructNesting_; + doContext.Check(stmt); doContext.ActivateIndexVars(GetControls(stmt)); } void DoForallChecker::Leave(const parser::ForallStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; - doContext.Check(stmt); + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.DeactivateIndexVars(GetControls(stmt)); + --constructNesting_; } void DoForallChecker::Leave(const parser::ForallAssignmentStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.Check(stmt); } diff --git a/flang/lib/Semantics/check-do-forall.h b/flang/lib/Semantics/check-do-forall.h index 3b2ae59f5f3fff3..f08ff3467d4cc93 100644 --- a/flang/lib/Semantics/check-do-forall.h +++ 
b/flang/lib/Semantics/check-do-forall.h @@ -60,6 +60,7 @@ class DoForallChecker : public virtual BaseChecker { private: SemanticsContext &context_; int exprDepth_{0}; + int constructNesting_{0}; void SayBadLeave( StmtType, const char *enclosingStmt, const ConstructNode &) const; diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp index 8bde737c4cb948d..54e8e09cbf7e48a 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -675,6 +675,7 @@ void IoChecker::Leave(const parser::BackspaceStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 + CheckForUselessIomsg(); Done(); } @@ -682,6 +683,7 @@ void IoChecker::Leave(const parser::CloseStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1208 + CheckForUselessIomsg(); Done(); } @@ -689,6 +691,7 @@ void IoChecker::Leave(const parser::EndfileStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 + CheckForUselessIomsg(); Done(); } @@ -696,6 +699,7 @@ void IoChecker::Leave(const parser::FlushStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1243 + CheckForUselessIomsg(); Done(); } @@ -708,6 +712,7 @@ void IoChecker::Leave(const parser::InquireStmt &stmt) { "UNIT number or FILE"); // C1246 CheckForProhibitedSpecifier(IoSpecKind::File, IoSpecKind::Unit); // C1246 CheckForRequiredSpecifier(IoSpecKind::Id, IoSpecKind::Pending); // C1248 + CheckForUselessIomsg(); } Done(); } @@ -742,11 +747,13 @@ void IoChecker::Leave(const parser::OpenStmt &) { CheckForProhibitedSpecifier(flags_.test(Flag::AccessStream), "STATUS='STREAM'", IoSpecKind::Recl); // 12.5.6.15 } + CheckForUselessIomsg(); Done(); } void IoChecker::Leave(const parser::PrintStmt &) { CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -817,6 
+824,7 @@ void IoChecker::Leave(const parser::RewindStmt &) { CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -824,6 +832,7 @@ void IoChecker::Leave(const parser::WaitStmt &) { CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1237 CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -883,6 +892,7 @@ void IoChecker::LeaveReadWrite() const { "FMT or NML"); // C1227 CheckForRequiredSpecifier(IoSpecKind::Round, flags_.test(Flag::FmtOrNml), "FMT or NML"); // C1227 + CheckForUselessIomsg(); } void IoChecker::SetSpecifier(IoSpecKind specKind) { @@ -1057,6 +1067,15 @@ void IoChecker::CheckForPureSubprogram() const { // C1597 } } +void IoChecker::CheckForUselessIomsg() const { + if (specifierSet_.test(IoSpecKind::Iomsg) && + !specifierSet_.test(IoSpecKind::Err) && + !specifierSet_.test(IoSpecKind::Iostat) && + context_.ShouldWarn(common::UsageWarning::UselessIomsg)) { + context_.Say("IOMSG= is useless without either ERR= or IOSTAT="_warn_en_US); + } +} + // Seeks out an allocatable or pointer ultimate component that is not // nested in a nonallocatable/nonpointer component with a specific // defined I/O procedure. 
diff --git a/flang/lib/Semantics/check-io.h b/flang/lib/Semantics/check-io.h index 0ef166f7f100edb..2fb03c63afe353d 100644 --- a/flang/lib/Semantics/check-io.h +++ b/flang/lib/Semantics/check-io.h @@ -125,6 +125,7 @@ class IoChecker : public virtual BaseChecker { void CheckForDefinableVariable(const A &var, const std::string &s) const; void CheckForPureSubprogram() const; + void CheckForUselessIomsg() const; parser::Message *CheckForBadIoType(const evaluate::DynamicType &, common::DefinedIo, parser::CharBlock) const; diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp index ae76f668f6ce7b9..62fed63df4475cf 100644 --- a/flang/lib/Semantics/definable.cpp +++ b/flang/lib/Semantics/definable.cpp @@ -223,7 +223,8 @@ static std::optional WhyNotDefinableLast(parser::CharBlock at, } if (const DerivedTypeSpec * derived{GetDerivedTypeSpec(dyType)}) { if (!flags.test(DefinabilityFlag::PolymorphicOkInPure)) { - if (auto bad{FindPolymorphicAllocatableUltimateComponent(*derived)}) { + if (auto bad{ + FindPolymorphicAllocatablePotentialComponent(*derived)}) { return BlameSymbol(at, "'%s' has polymorphic component '%s' in a pure subprogram"_en_US, original, bad.BuildResultDesignatorName()); diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 0ec96d447c0b9d4..4f1c53f1bb53fcb 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -3026,8 +3026,7 @@ const Symbol *AssumedTypeDummy( bool ExpressionAnalyzer::CheckIsValidForwardReference( const semantics::DerivedTypeSpec &dtSpec) { if (dtSpec.IsForwardReferenced()) { - Say("Cannot construct value for derived type '%s' " - "before it is defined"_err_en_US, + Say("Cannot construct value for derived type '%s' before it is defined"_err_en_US, dtSpec.name()); return false; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b7725c5b0022844..b4875d87d172c2b 100644 --- 
a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5507,11 +5507,8 @@ void DeclarationVisitor::Post(const parser::DerivedTypeStmt &x) { std::optional extendsType{ ResolveExtendsType(name, extendsName)}; DerivedTypeDetails derivedTypeDetails; - if (Symbol * typeSymbol{FindInScope(currScope(), name)}; typeSymbol && - typeSymbol->has() && - typeSymbol->get().isForwardReferenced()) { - derivedTypeDetails.set_isForwardReferenced(true); - } + // Catch any premature structure constructors within the definition + derivedTypeDetails.set_isForwardReferenced(true); auto &symbol{MakeSymbol(name, GetAttrs(), std::move(derivedTypeDetails))}; symbol.ReplaceName(name.source); derivedTypeInfo_.type = &symbol; @@ -9235,7 +9232,7 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) { node.GetKind() == ProgramTree::Kind::Submodule}; for (auto &pair : *node.scope()) { Symbol &symbol{*pair.second}; - if (inModule && symbol.attrs().test(Attr::EXTERNAL) && + if (inModule && symbol.attrs().test(Attr::EXTERNAL) && !IsPointer(symbol) && !symbol.test(Symbol::Flag::Function) && !symbol.test(Symbol::Flag::Subroutine)) { // in a module, external proc without return type is subroutine diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index 3cb24f6c6af4371..f7a277d1b414f6d 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -221,10 +221,13 @@ static bool PerformStatementSemantics( if (context.languageFeatures().IsEnabled(common::LanguageFeature::CUDA)) { SemanticsVisitor{context}.Walk(program); } - if (!context.AnyFatalError()) { + if (!context.messages().AnyFatalError()) { + // Do this if all messages are only warnings if (context.ShouldWarn(common::UsageWarning::UndefinedFunctionResult)) { WarnUndefinedFunctionResult(context, context.globalScope()); } + } + if (!context.AnyFatalError()) { pass2.CompileDataInitializationsIntoInitializers(); } return 
!context.AnyFatalError(); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index fdaf052c2d34eb3..57d84bde60b43c0 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -866,7 +866,7 @@ const Symbol *HasImpureFinal(const Symbol &original, std::optional rank) { bool MayRequireFinalization(const DerivedTypeSpec &derived) { return IsFinalizable(derived) || - FindPolymorphicAllocatableUltimateComponent(derived); + FindPolymorphicAllocatablePotentialComponent(derived); } bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) { @@ -1404,11 +1404,11 @@ DirectComponentIterator::const_iterator FindAllocatableOrPointerDirectComponent( return std::find_if(directs.begin(), directs.end(), IsAllocatableOrPointer); } -UltimateComponentIterator::const_iterator -FindPolymorphicAllocatableUltimateComponent(const DerivedTypeSpec &derived) { - UltimateComponentIterator ultimates{derived}; +PotentialComponentIterator::const_iterator +FindPolymorphicAllocatablePotentialComponent(const DerivedTypeSpec &derived) { + PotentialComponentIterator potentials{derived}; return std::find_if( - ultimates.begin(), ultimates.end(), IsPolymorphicAllocatable); + potentials.begin(), potentials.end(), IsPolymorphicAllocatable); } const Symbol *FindUltimateComponent(const DerivedTypeSpec &derived, diff --git a/flang/module/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90 index 810a2b0e400f242..cebd60452018125 100644 --- a/flang/module/__fortran_ieee_exceptions.f90 +++ b/flang/module/__fortran_ieee_exceptions.f90 @@ -129,7 +129,7 @@ end subroutine ieee_set_modes_0 public :: ieee_set_modes interface ieee_set_status - subroutine ieee_set_status_0(status) + pure subroutine ieee_set_status_0(status) import ieee_status_type type(ieee_status_type), intent(in) :: status end subroutine ieee_set_status_0 diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp index cd00d40361d28b3..bd657b800c61e89 
100644 --- a/flang/runtime/CUDA/allocator.cpp +++ b/flang/runtime/CUDA/allocator.cpp @@ -18,8 +18,9 @@ #include "cuda.h" namespace Fortran::runtime::cuda { +extern "C" { -void CUFRegisterAllocator() { +void RTDEF(CUFRegisterAllocator)() { allocatorRegistry.Register( kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned}); allocatorRegistry.Register( @@ -29,6 +30,7 @@ void CUFRegisterAllocator() { allocatorRegistry.Register( kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified}); } +} void *CUFAllocPinned(std::size_t sizeInBytes) { void *p; diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 37989bbcee0ab8d..71021dd8a01588c 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -119,7 +119,7 @@ static RT_API_ATTRS bool EditBOZInput( std::memset(n, 0, bytes); int increment{isHostLittleEndian ? -1 : 1}; auto *data{reinterpret_cast(n) + - (isHostLittleEndian ? significantBytes - 1 : 0)}; + (isHostLittleEndian ? significantBytes - 1 : bytes - significantBytes)}; int shift{((digits - 1) * LOG2_BASE) & 7}; while (digits > 0) { char32_t ch{*io.NextInField(remaining, edit)}; diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 1274d3921dd854e..569e72f57d6d6c0 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -14,7 +14,6 @@ func.func @_QPsub1() { return } - // CHECK-LABEL: func.func @_QPsub1() // CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref, i32) -> !fir.ref> // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref>) -> !fir.ref>>> @@ -27,4 +26,37 @@ func.func @_QPsub1() { // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref, i32) -> none +// Check operations that should not be transformed yet. 
+func.func @_QPsub2() { + %0 = cuf.alloc !fir.array<10xf32> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref> + cuf.free %0 : !fir.ref> {data_attr = #cuf.cuda} + return } + +// CHECK-LABEL: func.func @_QPsub2() +// CHECK: cuf.alloc !fir.array<10xf32> +// CHECK: cuf.free %{{.*}} : !fir.ref> + +fir.global @_QMmod1Ea {data_attr = #cuf.cuda} : !fir.box>> { + %0 = fir.zero_bits !fir.heap> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> +} + +func.func @_QPsub3() { + %0 = fir.address_of(@_QMmod1Ea) : !fir.ref>>> + %1:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = cuf.allocate %1#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + %3 = cuf.deallocate %1#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QPsub3() +// CHECK: cuf.allocate +// CHECK: cuf.deallocate + +} + + diff --git a/flang/test/Lower/CUDA/cuda-program-global.cuf b/flang/test/Lower/CUDA/cuda-program-global.cuf index 97b9927b3082fd8..a3c9e1ba8d253c1 100644 --- a/flang/test/Lower/CUDA/cuda-program-global.cuf +++ b/flang/test/Lower/CUDA/cuda-program-global.cuf @@ -1,19 +1,22 @@ ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s -! Test lowering of program local variable that are global +! Test lowering of program local variables. Make sure CUDA device variables are +! not lowered as global. program test integer, device :: a(10) + integer, unified :: u(10) integer :: b(10) integer :: i print*,i end ! CHECK-LABEL: func.func @_QQmain() -! CHECK: fir.address_of(@_QFEa) : !fir.ref> +! CHECK: cuf.alloc !fir.array<10xi32> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref> ! CHECK: fir.address_of(@_QFEb) : !fir.ref> ! 
CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} ! CHECK: hlfir.declare %[[ALLOCA]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.global internal @_QFEa {data_attr = #cuf.cuda} : !fir.array<10xi32> {{{$}} +! CHECK-NOT: fir.global internal @_QFEa {data_attr = #cuf.cuda} : !fir.array<10xi32> {{{$}} ! CHECK: fir.global internal @_QFEb : !fir.array<10xi32> {{{$}} +! CHECK: fir.global internal @_QFEu {data_attr = #cuf.cuda} : !fir.array<10xi32> diff --git a/flang/test/Semantics/allocate08.f90 b/flang/test/Semantics/allocate08.f90 index cc074a149ae9ed4..b2b88f78b32bea5 100644 --- a/flang/test/Semantics/allocate08.f90 +++ b/flang/test/Semantics/allocate08.f90 @@ -95,6 +95,42 @@ subroutine bar end subroutine end module +module mod1 + type, bind(C) :: t + integer :: n + end type + type(t), allocatable :: x +end + +module mod2 + type, bind(C) :: t + integer :: n + end type + type(t), allocatable :: x +end + +module mod3 + type, bind(C) :: t + real :: a + end type + type(t), allocatable :: x +end + +subroutine same_type + use mod1, only: a => x + use mod2, only: b => x + use mod3, only: c => x + allocate(a) + allocate(b, source=a) ! ok + deallocate(a) + allocate(a, source=b) ! ok + !ERROR: Allocatable object in ALLOCATE must be type compatible with source expression from MOLD or SOURCE + allocate(c, source=a) + deallocate(a) + !ERROR: Allocatable object in ALLOCATE must be type compatible with source expression from MOLD or SOURCE + allocate(a, source=c) +end + ! Related to C945, check typeless expression are caught subroutine sub diff --git a/flang/test/Semantics/assign03.f90 b/flang/test/Semantics/assign03.f90 index a80ef1e102b2b99..d8e7f14238f9203 100644 --- a/flang/test/Semantics/assign03.f90 +++ b/flang/test/Semantics/assign03.f90 @@ -1,6 +1,10 @@ ! RUN: %python %S/test_errors.py %s %flang_fc1 ! 
Pointer assignment constraints 10.2.2.2 (see also assign02.f90) +module m0 + procedure(),pointer,save :: p +end + module m interface subroutine s(i) @@ -324,4 +328,10 @@ subroutine s14 !ERROR: Statement function 'sf' may not be the target of a pointer assignment ptr => sf end subroutine + + subroutine s15 + use m0 + intrinsic sin + p=>sin ! ok + end end diff --git a/flang/test/Semantics/bad-forward-type.f90 b/flang/test/Semantics/bad-forward-type.f90 index 432d450a15f3fc5..27c6045b0059fa2 100644 --- a/flang/test/Semantics/bad-forward-type.f90 +++ b/flang/test/Semantics/bad-forward-type.f90 @@ -76,7 +76,8 @@ subroutine s8 !ERROR: Cannot construct value for derived type 't2' before it is defined parameter(y=t2(12.3)) type t2 - real :: c + !ERROR: Cannot construct value for derived type 't2' before it is defined + real :: c = transfer(t2(),0.) end type end subroutine diff --git a/flang/test/Semantics/call10.f90 b/flang/test/Semantics/call10.f90 index ffb3b48c329e72e..2d2f57934cd8aa2 100644 --- a/flang/test/Semantics/call10.f90 +++ b/flang/test/Semantics/call10.f90 @@ -78,7 +78,7 @@ pure function f07() ! C1585 class(t), allocatable :: f07 end function pure function f08() ! C1585 - !ERROR: Result of pure function may not have polymorphic ALLOCATABLE ultimate component '%a' + !ERROR: Result of pure function may not have polymorphic ALLOCATABLE potential component '%a' type(polyAlloc) :: f08 end function diff --git a/flang/test/Semantics/call11.f90 b/flang/test/Semantics/call11.f90 index f4f474079556238..7bc4931890dee59 100644 --- a/flang/test/Semantics/call11.f90 +++ b/flang/test/Semantics/call11.f90 @@ -42,6 +42,46 @@ subroutine test !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT a(j) = impure(j) ! C1139 end do + do concurrent (k=impure(1):1); end do ! ok + do concurrent (k=1:impure(1)); end do ! ok + do concurrent (k=1:1:impure(1)); end do ! ok + forall (k=impure(1):1); end forall ! ok + forall (k=1:impure(1)); end forall ! 
ok + forall (k=1:1:impure(1)); end forall ! ok + do concurrent (j=1:1) + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=impure(1):1); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=1:impure(1)); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=1:1:impure(1)); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=impure(1):1); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=impure(1):1) a(k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:impure(1)) a(k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:1:impure(1)) a(k) = 0. + end do + forall (j=1:1) + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=impure(1):1); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=impure(1):1) a(j*k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:impure(1)) a(j*k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:1:impure(1)) a(j*k) = 0. 
+ end forall end subroutine subroutine test2 diff --git a/flang/test/Semantics/doconcurrent01.f90 b/flang/test/Semantics/doconcurrent01.f90 index 7c13a26814e5be3..9bb2b4537683510 100644 --- a/flang/test/Semantics/doconcurrent01.f90 +++ b/flang/test/Semantics/doconcurrent01.f90 @@ -48,18 +48,22 @@ subroutine do_concurrent_test2(i,j,n,flag) change team (j) !ERROR: An image control statement is not allowed in DO CONCURRENT critical - call ieee_get_status(status) ! ok -!ERROR: IEEE_SET_HALTING_MODE is not allowed in DO CONCURRENT - call ieee_set_halting_mode(flag, halting) end critical end team !ERROR: ADVANCE specifier is not allowed in DO CONCURRENT write(*,'(a35)',advance='no') - end do - -! The following is OK - do concurrent (i = 1:n) - call ieee_set_flag(flag, flagValue) +!ERROR: 'ieee_get_status' may not be called in DO CONCURRENT + call ieee_get_status(status) +!ERROR: 'ieee_set_status' may not be called in DO CONCURRENT + call ieee_set_status(status) +!ERROR: 'ieee_get_halting_mode' may not be called in DO CONCURRENT + call ieee_get_halting_mode(flag, halting) +!ERROR: 'ieee_set_halting_mode' may not be called in DO CONCURRENT + call ieee_set_halting_mode(flag, halting) +!ERROR: 'ieee_get_flag' may not be called in DO CONCURRENT + call ieee_get_flag(flag, flagValue) +!ERROR: 'ieee_set_flag' may not be called in DO CONCURRENT + call ieee_set_flag(flag, flagValue) end do end subroutine do_concurrent_test2 diff --git a/flang/test/Semantics/io05.f90 b/flang/test/Semantics/io05.f90 index 8480ea4b784c299..bef0d6db89524a8 100644 --- a/flang/test/Semantics/io05.f90 +++ b/flang/test/Semantics/io05.f90 @@ -55,6 +55,7 @@ inquire(1, read=c(1), write=c(2), sign=c(3), sign=c(4), read=c(5), write=c(1)) !ERROR: Duplicate IOMSG specifier + !WARNING: IOMSG= is useless without either ERR= or IOSTAT= inquire(10, iomsg=msg, pos=ipos, iomsg=msg) !ERROR: If ID appears, PENDING must also appear diff --git a/flang/test/Semantics/resolve77.f90 b/flang/test/Semantics/resolve77.f90 index 
ffee10271d51bfa..943993ee74d76ee 100644 --- a/flang/test/Semantics/resolve77.f90 +++ b/flang/test/Semantics/resolve77.f90 @@ -1,4 +1,4 @@ -! RUN: %python %S/test_errors.py %s %flang_fc1 +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic ! Tests valid and invalid usage of forward references to procedures ! in specification expressions. module m @@ -56,3 +56,16 @@ pure integer function if2(n) if2 = n end function end subroutine + +block data + common /blk2/ n + data n/100/ + !ERROR: Automatic data object 'a' may not appear in a BLOCK DATA subprogram + real a(n) +end + +program main + common /blk2/ n + !PORTABILITY: Automatic data object 'a' should not appear in the specification part of a main program + real a(n) +end diff --git a/flang/test/Semantics/stmt-func01.f90 b/flang/test/Semantics/stmt-func01.f90 index 83c31ded1d39b93..a87b0d7af52b470 100644 --- a/flang/test/Semantics/stmt-func01.f90 +++ b/flang/test/Semantics/stmt-func01.f90 @@ -10,6 +10,7 @@ program main pure integer function ifunc() end function end interface + !PORTABILITY: Automatic data object 'x1' should not appear in the specification part of a main program type(t1(k=4,l=ifunc())) x1 !PORTABILITY: Statement function 'sf1' should not contain an array constructor sf1(n) = sum([(j,j=1,n)]) diff --git a/flang/test/Semantics/structconst02.f90 b/flang/test/Semantics/structconst02.f90 index 24ec0a196401574..71d6b720fb41a4c 100644 --- a/flang/test/Semantics/structconst02.f90 +++ b/flang/test/Semantics/structconst02.f90 @@ -14,7 +14,7 @@ end function realfunc integer(kind=ik) :: ix = int(0,kind=ik) real(kind=rk) :: rx = real(0.,kind=rk) complex(kind=zk) :: zx = cmplx(0.,0.,kind=zk) - !ERROR: Initialization expression for 'cx' (%SET_LENGTH(" ",len)) cannot be computed as a constant value + !ERROR: Initialization expression for 'cx' (" ") cannot be computed as a constant value character(kind=ck,len=len) :: cx = ' ' logical(kind=lk) :: lx = .false. 
real(kind=rk), pointer :: rp => NULL() diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90 new file mode 100644 index 000000000000000..92efc8f9ea54b8a --- /dev/null +++ b/flang/test/Semantics/typeinfo11.f90 @@ -0,0 +1,17 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +!Tests that derived types with polymorphic potential subobject +!components do not have their noFinalizationNeeded flags set, even +!when those components are packaged within another allocatable. + +type t1 + class(*), allocatable :: a +end type +type t2 + type(t1), allocatable :: b +end type +type(t2) x +end + +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90 index a372fcd544feead..dd73f9c76df0a55 100644 --- a/flang/test/Semantics/undef-result01.f90 +++ b/flang/test/Semantics/undef-result01.f90 @@ -121,6 +121,7 @@ integer function defdBySize() end character(40) function defdByIomsg() + !WARNING: IOMSG= is useless without either ERR= or IOSTAT= write(123,*,iomsg=defdByIomsg) end diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index baa949600283517..9f33cdfe3fa90f7 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -11,18 +11,9 @@ set( LLVM_LINK_COMPONENTS TargetParser ) -option(FLANG_PLUGIN_SUPPORT "Build Flang with plugin support." 
ON) - -# Enable support for plugins, which need access to symbols from flang-new -if(FLANG_PLUGIN_SUPPORT) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) -endif() - add_flang_tool(flang-new driver.cpp fc1_main.cpp - - ${export_symbols} ) target_link_libraries(flang-new @@ -37,4 +28,11 @@ clang_target_link_libraries(flang-new clangBasic ) +option(FLANG_PLUGIN_SUPPORT "Build Flang with plugin support." ON) + +# Enable support for plugins, which need access to symbols from flang-new +if(FLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(flang-new) +endif() + install(TARGETS flang-new DESTINATION "${CMAKE_INSTALL_BINDIR}") diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp index 4f53e654034cb76..9f5ec289ee8f740 100644 --- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -55,7 +55,7 @@ class ScopedContext { TEST(AllocatableCUFTest, SimpleDeviceAllocate) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; @@ -73,7 +73,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) { TEST(AllocatableCUFTest, SimplePinnedAllocate) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // INTEGER(4), PINNED, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Integer, 4)}; @@ -92,7 +92,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) { TEST(AllocatableCUFTest, DescriptorAllocationTest) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp 
b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 28235cf137c1867..f237e2ea1b9545e 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -126,12 +126,14 @@ void print_header() { LIBC_NAMESPACE::printf("Running Suite: %-10s\n", benchmarks[0]->get_suite_name().data()); LIBC_NAMESPACE::printf("%s", RESET); - LIBC_NAMESPACE::printf( + cpp::string titles = "Benchmark | Cycles | Min | Max | " - "Iterations | Time / Iteration | Stddev | Threads |\n"); - LIBC_NAMESPACE::printf( - "---------------------------------------------------------------------" - "--------------------------------\n"); + "Iterations | Time / Iteration | Stddev | Threads |\n"; + LIBC_NAMESPACE::printf(titles.data()); + + cpp::string separator(titles.size(), '-'); + separator[titles.size() - 1] = '\n'; + LIBC_NAMESPACE::printf(separator.data()); } void Benchmark::run_benchmarks() { diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 2b85b146ed7459c..830e6f9e89a7439 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -21,7 +21,7 @@ namespace benchmarks { struct BenchmarkOptions { uint32_t initial_iterations = 1; - uint32_t min_iterations = 50; + uint32_t min_iterations = 1; uint32_t max_iterations = 10000000; uint32_t min_samples = 4; uint32_t max_samples = 1000; @@ -111,9 +111,15 @@ class Benchmark { }; // We want our random values to be approximately -// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) < -// 2^(max_exponent + 1) -template static T get_rand_input() { +// Output: a random number with the exponent field between min_exp and max_exp, +// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), +// Caveats: +// -EXP_BIAS corresponding to denormal values, +// EXP_BIAS + 1 corresponding to inf or nan. 
+template +static T +get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, + int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; // Required to correctly instantiate FPBits for floats and doubles. @@ -125,10 +131,11 @@ template static T get_rand_input() { static_cast(LIBC_NAMESPACE::rand()); else bits = LIBC_NAMESPACE::rand(); - double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN / 2048.0; + double scale = + static_cast(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); FPBits fp(bits); fp.set_biased_exponent( - static_cast(fp.get_biased_exponent() * scale)); + static_cast(fp.get_biased_exponent() * scale + min_exp)); return fp.get_val(); } @@ -141,19 +148,15 @@ template class MathPerf { public: typedef T Func(T); - static uint64_t run_perf_in_range(Func f, StorageType starting_bit, - StorageType ending_bit, StorageType step) { - uint64_t total_time = 0; - if (step <= 0) - step = 1; - volatile T result; - for (StorageType bits = starting_bit; bits < ending_bit; bits += step) { - T x = FPBits(bits).get_val(); - total_time += LIBC_NAMESPACE::latency(f, x); - } - StorageType num_runs = (ending_bit - starting_bit) / step + 1; - - return total_time / num_runs; + template + static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) { + cpp::array inputs; + for (size_t i = 0; i < N; ++i) + inputs[i] = get_rand_input(min_exp, max_exp); + + uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); + + return total_time / N; } }; @@ -176,5 +179,4 @@ template class MathPerf { #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ LIBC_NAMESPACE::gpu::get_lane_size()) - #endif diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt index 77250edb6bb526f..335da5ad71cf882 100644 --- a/libc/benchmarks/gpu/src/math/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt 
@@ -32,6 +32,7 @@ add_benchmark( sin_benchmark.cpp DEPENDS libc.src.math.sin + libc.src.math.sinf libc.src.stdlib.srand libc.src.stdlib.rand libc.src.__support.FPUtil.fp_bits diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index 5849ea3e99bb094..bf09e6e462172e8 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -5,6 +5,7 @@ #include "src/__support/CPP/functional.h" #include "src/__support/FPUtil/FPBits.h" #include "src/math/sin.h" +#include "src/math/sinf.h" #include "src/stdlib/rand.h" #ifdef NVPTX_MATH_FOUND @@ -15,51 +16,60 @@ #include "src/math/amdgpu/declarations.h" #endif -constexpr double M_PI = 3.14159265358979323846; -uint64_t get_bits(double x) { - return LIBC_NAMESPACE::cpp::bit_cast(x); -} - // BENCHMARK() expects a function that with no parameters that returns a // uint64_t representing the latency. Defining each benchmark using macro that // expands to a lambda to allow us to switch the implementation of `sin()` to // easily register NVPTX benchmarks. 
-#define BM_RANDOM_INPUT(Func) \ +#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ []() { \ - double x = LIBC_NAMESPACE::benchmarks::get_rand_input(); \ - return LIBC_NAMESPACE::latency(Func, x); \ + return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ + N>(Func, MIN_EXP, MAX_EXP); \ } -BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin)); -#define BM_TWO_PI(Func) \ - []() { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_perf_in_range( \ - Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \ - } -BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin)); +#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096)) -#define BM_LARGE_INT(Func) \ - []() { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_perf_in_range( \ - Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \ - } -BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::sin)); +BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023); +BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3); +BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); +BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); + +#ifdef NVPTX_MATH_FOUND +BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); +BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); +BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); +BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(double, AmdSin, 
LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); +BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); +BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); +BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +#endif + +BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128); +BENCH(float, SinfTwoPi, LIBC_NAMESPACE::sinf, -10, 3); +BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30); +BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120); #ifdef NVPTX_MATH_FOUND -BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, - BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin)); -BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi, - BM_TWO_PI(LIBC_NAMESPACE::__nv_sin)); -BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin)); +BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128); +BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3); +BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30); +BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120); #endif #ifdef AMDGPU_MATH_FOUND -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin, - BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64)); -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi, - BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64)); -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64)); +BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128); +BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3); +BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30); +BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120); #endif diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt index 179429db9a09ae8..aa5dcd33bee9c81 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt @@ -4,4 +4,8 @@ add_header_library( timing.h DEPENDS 
libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.macros.attributes + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.array ) diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index e308d619e956956..d5c3df27b7de605 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU +#include "src/__support/CPP/array.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -17,14 +18,6 @@ #include -// AMDGPU does not support input register constraints for i1 and i8, so we must -// cast them to uint16_t's before loading them into registers. -#define FORCE_TO_REGISTER(TYPE, VARIABLE) \ - if constexpr (cpp::is_same_v || cpp::is_same_v) \ - asm("" ::"v"(static_cast(VARIABLE))); \ - else \ - asm("" ::"v"(VARIABLE)) - namespace LIBC_NAMESPACE_DECL { // Returns the overhead associated with calling the profiling region. This @@ -50,8 +43,6 @@ template volatile T storage = t; T arg = storage; - FORCE_TO_REGISTER(T, arg); - // The AMDGPU architecture needs to wait on pending results. gpu::memory_fence(); // Get the current timestamp from the clock. @@ -59,7 +50,6 @@ template // This forces the compiler to load the input argument and run the clock // cycle counter before the profiling region. - FORCE_TO_REGISTER(T, arg); asm("" ::"s"(start)); // Run the function under test and return its value. @@ -67,8 +57,15 @@ template // This inline assembly performs a no-op which forces the result to both // be used and prevents us from exiting this region before it's complete. 
- asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( - static_cast(result))); + if constexpr (cpp::is_same_v || + cpp::is_same_v) + // AMDGPU does not support input register constraints for i1 and i8, so we + // cast it to a 32-bit integer. This does not add an additional assembly + // instruction (https://godbolt.org/z/zxGqv8G91). + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( + static_cast(result))); + else + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); // Obtain the current timestamp after running the calculation and force // ordering. @@ -87,20 +84,19 @@ template T1 arg1 = storage1; T2 arg2 = storage2; - FORCE_TO_REGISTER(T1, arg1); - FORCE_TO_REGISTER(T2, arg2); - gpu::memory_fence(); uint64_t start = gpu::processor_clock(); - FORCE_TO_REGISTER(T1, arg1); - FORCE_TO_REGISTER(T2, arg2); asm("" ::"s"(start)); auto result = f(arg1, arg2); - asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( - static_cast(result))); + if constexpr (cpp::is_same_v || + cpp::is_same_v) + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( + static_cast(result))); + else + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); @@ -109,6 +105,31 @@ template return stop - start; } +// Provides throughput benchmarking. +template +[[gnu::noinline]] static LIBC_INLINE uint64_t +throughput(F f, const cpp::array &inputs) { + asm("" ::"v"(&inputs)); + + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + asm("" ::"s"(start)); + + for (auto input : inputs) { + auto result = f(input); + + asm("" ::"v"(result)); + } + + uint64_t stop = gpu::processor_clock(); + asm("" ::"s"(stop)); + gpu::memory_fence(); + + // Return the time elapsed. 
+ return stop - start; +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt index 9958e16206a410a..2723c8940814c6f 100644 --- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -4,4 +4,8 @@ add_header_library( timing.h DEPENDS libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.macros.attributes + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.array ) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index b426dfd0ea1535f..637986abd9092da 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" #include "src/__support/macros/attributes.h" @@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL { volatile uint32_t x = 1; uint32_t y = x; uint64_t start = gpu::processor_clock(); - asm("" ::"r"(y), "llr"(start)); + asm("" ::"llr"(start)); uint32_t result = y; asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); uint64_t stop = gpu::processor_clock(); @@ -42,7 +44,6 @@ template // not constant propagate it and remove the profiling region. volatile T storage = t; T arg = storage; - asm("" ::"r"(arg)); // Get the current timestamp from the clock. gpu::memory_fence(); @@ -50,7 +51,7 @@ template // This forces the compiler to load the input argument and run the clock cycle // counter before the profiling region. - asm("" ::"r"(arg), "llr"(start)); + asm("" ::"llr"(start)); // Run the function under test and return its value. 
auto result = f(arg); @@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { volatile T2 storage2 = t2; T1 arg = storage; T2 arg2 = storage2; - asm("" ::"r"(arg), "r"(arg2)); gpu::memory_fence(); uint64_t start = gpu::processor_clock(); - asm("" ::"r"(arg), "r"(arg2), "llr"(start)); + asm("" ::"llr"(start)); auto result = f(arg, arg2); @@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { return stop - start; } + +// Provides throughput benchmarking. +template +[[gnu::noinline]] static LIBC_INLINE uint64_t +throughput(F f, const cpp::array &inputs) { + asm("" ::"r"(&inputs)); + + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + asm("" ::"llr"(start)); + + uint64_t result; + for (auto input : inputs) { + asm("" ::"r"(input)); + result = f(input); + asm("" ::"r"(result)); + } + + uint64_t stop = gpu::processor_clock(); + gpu::memory_fence(); + asm("" ::"r"(stop)); + volatile auto output = result; + + // Return the time elapsed. 
+ return stop - start; +} } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 9fc10375a1d379f..e3dfe1a15296919 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -71,6 +71,10 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}") endif() + if(LIBC_TYPES_TIME_T_IS_32_BIT AND LLVM_LIBC_FULL_BUILD) + list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) endfunction(_get_compile_options_from_config) diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake index 3629a7f111a7c52..69f31ace80dd3dd 100644 --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -286,3 +286,16 @@ if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2") LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU)) set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE) endif() + +# Choose whether time_t is 32- or 64-bit, based on target architecture +# and config options. This will be used to set a #define during the +# library build, and also to select the right version of time_t.h for +# the output headers. 
+if(LIBC_TARGET_ARCHITECTURE_IS_ARM AND NOT (LIBC_CONF_TIME_64BIT)) + # Set time_t to 32 bit for compatibility with glibc, unless + # configuration says otherwise + set(LIBC_TYPES_TIME_T_IS_32_BIT TRUE) +else() + # Other platforms default to 64-bit time_t + set(LIBC_TYPES_TIME_T_IS_32_BIT FALSE) +endif() diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 3049f4db7301f66..c2c675bda26d31e 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -9,8 +9,8 @@ function(add_header target_name) cmake_parse_arguments( "ADD_HEADER" - "" # No optional arguments - "HDR" # Single value arguments + "" # No optional arguments + "HDR;DEST_HDR" # Single value arguments "DEPENDS" ${ARGN} ) @@ -18,7 +18,12 @@ function(add_header target_name) message(FATAL_ERROR "'add_header' rules requires the HDR argument specifying a headef file.") endif() - set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_HEADER_HDR}) + if(ADD_HEADER_DEST_HDR) + set(dest_leaf_filename ${ADD_HEADER_DEST_HDR}) + else() + set(dest_leaf_filename ${ADD_HEADER_HDR}) + endif() + set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${dest_leaf_filename}) file(RELATIVE_PATH relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) set(dest_file ${LIBC_INCLUDE_DIR}/${relative_path}) set(src_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_HEADER_HDR}) diff --git a/libc/config/CMakeLists.txt b/libc/config/CMakeLists.txt index 853854b03be4e4e..cf38ae3eed72677 100644 --- a/libc/config/CMakeLists.txt +++ b/libc/config/CMakeLists.txt @@ -1,3 +1,7 @@ -#TODO: Properly select the correct subdirectory. 
- -add_subdirectory(linux) +add_header_library( app_h HDRS app.h DEPENDS libc.src.__support.common ) diff --git a/libc/config/app.h b/libc/config/app.h new file mode 100644 index 000000000000000..27f4141d80c4bee --- /dev/null +++ b/libc/config/app.h @@ -0,0 +1,20 @@ +//===-- Classes to capture properties of applications -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_CONFIG_APP_H +#define LLVM_LIBC_CONFIG_APP_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_GPU) +#include "gpu/app.h" +#elif defined(__linux__) +#include "linux/app.h" +#endif + +#endif // LLVM_LIBC_CONFIG_APP_H diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index d9b0fd8d065862e..af9a8bc9925441f 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -394,6 +394,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 60d3070c963a050..6ebe2e4a29025f3 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -389,6 +389,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/config.json b/libc/config/config.json
index 538fea53cc704ab..2e72c0a3fd1d690 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -88,5 +88,11 @@ "value": true, "doc": "Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved." } + }, + "time": { + "LIBC_CONF_TIME_64BIT": { + "value": false, + "doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit." + } } } diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 27822f5013f2fcc..36da9e13136638f 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -149,6 +149,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.dfmal libc.src.math.dsqrtl libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dsubl libc.src.math.erff libc.src.math.exp @@ -247,6 +248,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt index aa13fb276750fc6..49c19571ac41927 100644 --- a/libc/config/darwin/x86_64/entrypoints.txt +++ b/libc/config/darwin/x86_64/entrypoints.txt @@ -120,6 +120,7 @@ set(TARGET_LIBM_ENTRYPOINTS #libc.src.math.coshf #libc.src.math.cosf #libc.src.math.daddl + #libc.src.math.ddivl #libc.src.math.dfmal #libc.src.math.dsqrtl #libc.src.math.dsubl diff --git a/libc/config/gpu/app.h b/libc/config/gpu/app.h new file mode 100644 index 000000000000000..148c51b702203f7 --- /dev/null +++ b/libc/config/gpu/app.h @@ -0,0 +1,28 @@ +//===-- Classes to capture properties of GPU applications -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_CONFIG_GPU_APP_H +#define LLVM_LIBC_CONFIG_GPU_APP_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Move other global values here and export them to the host. +struct DataEnvironment { + uintptr_t *env_ptr; +}; + +extern DataEnvironment app; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_CONFIG_GPU_APP_H diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index e1a16a3b688789b..5e05c1617a3be01 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -167,6 +167,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoull libc.src.stdlib.at_quick_exit libc.src.stdlib.quick_exit + libc.src.stdlib.getenv # TODO: Implement these correctly libc.src.stdlib.aligned_alloc @@ -335,6 +336,8 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.rintf libc.src.math.round libc.src.math.roundf + libc.src.math.scalbln + libc.src.math.scalblnf libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.sin diff --git a/libc/config/linux/CMakeLists.txt b/libc/config/linux/CMakeLists.txt deleted file mode 100644 index cf38ae3eed72677..000000000000000 --- a/libc/config/linux/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - app_h - HDRS - app.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f2ab6c0ba73d741..bb0ebca29e6ae25 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -389,6 +389,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.cospif libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal 
libc.src.math.dmull libc.src.math.dsqrtl @@ -541,6 +542,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl @@ -728,6 +732,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 9d3ab0c157aa81c..7fd60799bbcc722 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -371,6 +371,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index a7deccb9ded988c..0d48b55a9654836 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -388,6 +388,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.cospif libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal libc.src.math.dmull libc.src.math.dsqrtl @@ -544,6 +545,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl @@ -635,6 +639,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt 
b/libc/config/linux/x86_64/entrypoints.txt index 6b16843e5027d77..b9134c8496c30ae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -391,6 +391,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.dmull libc.src.math.dsqrtl libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dsubl libc.src.math.erff libc.src.math.exp @@ -544,6 +545,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl @@ -724,6 +728,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 87531028959ef11..7fa7eb22772e29b 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -137,6 +137,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cosf libc.src.math.coshf libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal libc.src.math.dsubl libc.src.math.erff @@ -263,6 +264,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 950de0eee4c05df..54ca5d55d7b2435 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -52,3 +52,5 @@ to learn about the defaults for your platform and target. * **"string" options** - ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled. 
- ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen. +* **"time" options** + - ``LIBC_CONF_TIME_64BIT``: Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit. diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index c0cecdfa1b25ac3..9185a95192963e2 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -116,7 +116,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dadd | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.1 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| ddiv | N/A | N/A | | N/A | |check|\* | 7.12.14.4 | F.10.11 | +| ddiv | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.4 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dfma | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.5 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -220,7 +220,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | roundeven | |check| | |check| | |check| | |check| | |check| | 7.12.9.8 | F.10.6.8 | 
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| scalbln | | | | |check| | | 7.12.6.19 | F.10.3.19 | +| scalbln | |check| | |check| | |check| | |check| | |check| | 7.12.6.19 | F.10.3.19 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | scalbn | |check| | |check| | |check| | |check| | |check| | 7.12.6.19 | F.10.3.19 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 9e77ab226ce6c25..0fa86e0152f9ba9 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -59,7 +59,11 @@ add_header(pthread_rwlockattr_t HDR pthread_rwlockattr_t.h) add_header(pthread_spinlock_t HDR pthread_spinlock_t.h DEPENDS .pid_t) add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type) add_header(rlim_t HDR rlim_t.h) -add_header(time_t HDR time_t.h) +if(LIBC_TYPES_TIME_T_IS_32_BIT) + add_header(time_t HDR time_t_32.h DEST_HDR time_t.h) +else() + add_header(time_t HDR time_t_64.h DEST_HDR time_t.h) +endif() add_header(stack_t HDR stack_t.h DEPENDS .size_t) add_header(suseconds_t HDR suseconds_t.h) add_header(struct_flock HDR struct_flock.h DEPENDS .off_t .pid_t) diff --git a/libc/include/llvm-libc-types/time_t.h b/libc/include/llvm-libc-types/time_t.h index 59953b343ba9634..76920dc07ec69c6 100644 --- a/libc/include/llvm-libc-types/time_t.h +++ b/libc/include/llvm-libc-types/time_t.h @@ -1,4 +1,4 @@ -//===-- Definition of the type time_t -------------------------------------===// +//===-- Definition of the type time_t, for use 
during the libc build ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,10 +9,10 @@ #ifndef LLVM_LIBC_TYPES_TIME_T_H #define LLVM_LIBC_TYPES_TIME_T_H -#if (defined(__arm__) || defined(_M_ARM)) -typedef __INTPTR_TYPE__ time_t; +#ifdef LIBC_TYPES_TIME_T_IS_32_BIT +#include "time_t_32.h" #else -typedef __INT64_TYPE__ time_t; +#include "time_t_64.h" #endif #endif // LLVM_LIBC_TYPES_TIME_T_H diff --git a/libc/include/llvm-libc-types/time_t_32.h b/libc/include/llvm-libc-types/time_t_32.h new file mode 100644 index 000000000000000..2c415f6fa9dcabb --- /dev/null +++ b/libc/include/llvm-libc-types/time_t_32.h @@ -0,0 +1,14 @@ +//===-- Definition of the type time_t -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_TIME_T_32_H +#define LLVM_LIBC_TYPES_TIME_T_32_H + +typedef __INT32_TYPE__ time_t; + +#endif // LLVM_LIBC_TYPES_TIME_T_32_H diff --git a/libc/include/llvm-libc-types/time_t_64.h b/libc/include/llvm-libc-types/time_t_64.h new file mode 100644 index 000000000000000..8f7fd3233646e62 --- /dev/null +++ b/libc/include/llvm-libc-types/time_t_64.h @@ -0,0 +1,14 @@ +//===-- Definition of the type time_t -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_TIME_T_64_H +#define LLVM_LIBC_TYPES_TIME_T_64_H + +typedef __INT64_TYPE__ time_t; + +#endif // LLVM_LIBC_TYPES_TIME_T_64_H diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index c0485428fc559e5..2f1203e581f9a72 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2313,6 +2313,43 @@ functions: - type: int - type: unsigned int guard: LIBC_TYPES_HAS_FLOAT128 + - name: scalbln + standards: + - stdc + return_type: double + arguments: + - type: double + - type: long + - name: scalblnl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long + - name: scalblnf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: long + - name: scalblnf16 + standards: + - stdc + return_type: float16 + arguments: + - type: float16 + - type: long + guard: LIBC_TYPES_HAS_FLOAT16 + - name: scalblnf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: long + guard: LIBC_TYPES_HAS_FLOAT128 - name: lgamma standards: - stdc @@ -2352,3 +2389,10 @@ functions: arguments: - type: long double - type: int * + - name: ddivl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py index 37a4f78ec4a7b67..3eb5e4ef2546c1a 100644 --- a/libc/newhdrgen/yaml_to_classes.py +++ b/libc/newhdrgen/yaml_to_classes.py @@ -190,7 +190,15 @@ def add_function_to_yaml(yaml_file, function_details): if new_function.attributes: function_dict["attributes"] = new_function.attributes - yaml_data["functions"].append(function_dict) + insert_index = 0 + for i, func in enumerate(yaml_data["functions"]): + if func["name"] > new_function.name: + insert_index = i + break + 
else: + insert_index = len(yaml_data["functions"]) + + yaml_data["functions"].insert(insert_index, function_dict) class IndentYamlListDumper(yaml.Dumper): def increase_indent(self, flow=False, indentless=False): diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 0c58af084698940..72bfe0cf71aa2bd 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -398,7 +398,7 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"ceilf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"daddl", RetValSpec, [ArgSpec, ArgSpec]>, - + FunctionSpec<"ddivl", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"dfmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"dsubl", RetValSpec, [ArgSpec, ArgSpec]>, @@ -717,7 +717,11 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"asinhf", RetValSpec, [ArgSpec]>, FunctionSpec<"atanhf", RetValSpec, [ArgSpec]>, + FunctionSpec<"scalbln", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"scalblnf", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"scalblnl", RetValSpec, [ArgSpec, ArgSpec]>, GuardedFunctionSpec<"scalblnf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"scalblnf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"scalbn", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"scalbnf", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index d8a192f1ffa570c..9bd1e29081a801f 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -190,8 +190,6 @@ add_header_library( libc.src.__support.CPP.bit libc.src.__support.CPP.limits libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode libc.src.errno.errno diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h index 
dad1772fce75007..f0e405772e9fa3c 100644 --- a/libc/src/__support/FPUtil/generic/div.h +++ b/libc/src/__support/FPUtil/generic/div.h @@ -35,7 +35,7 @@ div(InType x, InType y) { using InFPBits = FPBits; using InStorageType = typename InFPBits::StorageType; using DyadicFloat = - DyadicFloat(InFPBits::FRACTION_LEN))>; + DyadicFloat(InFPBits::SIG_LEN + 1))>; InFPBits x_bits(x); InFPBits y_bits(y); diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index c72bc1f957dc374..17cf3dd55b9fb4b 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -13,9 +13,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" @@ -27,6 +25,8 @@ #include "src/__support/uint128.h" #include "src/errno/libc_errno.h" // For ERANGE +#include + namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -525,10 +525,9 @@ clinger_fast_path(ExpandedFloat init_num, FPBits result; T float_mantissa; if constexpr (cpp::is_same_v>) { - float_mantissa = static_cast(fputil::DyadicFloat<128>( - Sign::POS, 0, - fputil::DyadicFloat<128>::MantissaType( - {uint64_t(mantissa), uint64_t(mantissa >> 64)}))); + float_mantissa = + (static_cast(uint64_t(mantissa)) * static_cast(0x1.0p64)) + + static_cast(uint64_t(mantissa >> 64)); } else { float_mantissa = static_cast(mantissa); } diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index c2f0ed0cb233db3..b6796f40adce7bd 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -77,7 +77,7 @@ add_object_library( thread.cpp DEPENDS .futex_utils - libc.config.linux.app_h + 
libc.config.app_h libc.include.sys_syscall libc.include.fcntl libc.src.errno.errno diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp index 36b4a88eba9b422..ee3f63fa3cde32e 100644 --- a/libc/src/__support/threads/linux/thread.cpp +++ b/libc/src/__support/threads/linux/thread.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/thread.h" -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/stringstream.h" diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 56e9bb60c1e4362..e2ebf2ddd4bfa44 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -88,6 +88,7 @@ add_math_entrypoint_object(cospif) add_math_entrypoint_object(daddl) add_math_entrypoint_object(daddf128) +add_math_entrypoint_object(ddivl) add_math_entrypoint_object(ddivf128) add_math_entrypoint_object(dmull) add_math_entrypoint_object(dmulf128) @@ -426,7 +427,11 @@ add_math_entrypoint_object(roundevenl) add_math_entrypoint_object(roundevenf16) add_math_entrypoint_object(roundevenf128) +add_math_entrypoint_object(scalbln) +add_math_entrypoint_object(scalblnf) +add_math_entrypoint_object(scalblnl) add_math_entrypoint_object(scalblnf16) +add_math_entrypoint_object(scalblnf128) add_math_entrypoint_object(scalbn) add_math_entrypoint_object(scalbnf) diff --git a/libc/src/math/ddivl.h b/libc/src/math/ddivl.h new file mode 100644 index 000000000000000..bf0da2887e330f2 --- /dev/null +++ b/libc/src/math/ddivl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for ddivl -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_DDIVL_H +#define LLVM_LIBC_SRC_MATH_DDIVL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double ddivl(long double x, long double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_DDIVL_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index e5f40673dd5f021..c80c7ca7f7af11d 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -154,6 +154,18 @@ add_entrypoint_object( libc.src.__support.FPUtil.generic.add_sub ) +add_entrypoint_object( + ddivl + SRCS + ddivl.cpp + HDRS + ../ddivl.h + COMPILE_OPTIONS + -O3 + DEPENDS + libc.src.__support.FPUtil.generic.div +) + add_entrypoint_object( ddivf128 SRCS @@ -4173,6 +4185,46 @@ add_entrypoint_object( libc.src.__support.macros.optimization ) + +add_entrypoint_object( + scalbln + SRCS + scalbln.cpp + HDRS + ../scalbln.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + scalblnf + SRCS + scalblnf.cpp + HDRS + ../scalblnf.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + scalblnl + SRCS + scalblnl.cpp + HDRS + ../scalblnl.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( scalblnf16 SRCS @@ -4187,6 +4239,20 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + scalblnf128 + SRCS + scalblnf128.cpp + HDRS + ../scalblnf128.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( scalbn SRCS diff --git 
a/libc/src/math/generic/ddivl.cpp b/libc/src/math/generic/ddivl.cpp new file mode 100644 index 000000000000000..18fc44d6f1648e9 --- /dev/null +++ b/libc/src/math/generic/ddivl.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of ddivl function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ddivl.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, ddivl, (long double x, long double y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalbln.cpp b/libc/src/math/generic/scalbln.cpp new file mode 100644 index 000000000000000..f97619954237eef --- /dev/null +++ b/libc/src/math/generic/scalbln.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalbln function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalbln.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." 
+#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, scalbln, (double x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnf.cpp b/libc/src/math/generic/scalblnf.cpp new file mode 100644 index 000000000000000..aa11a552a919401 --- /dev/null +++ b/libc/src/math/generic/scalblnf.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnf function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnf.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, scalblnf, (float x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnf128.cpp b/libc/src/math/generic/scalblnf128.cpp new file mode 100644 index 000000000000000..fda6ea0bfe03056 --- /dev/null +++ b/libc/src/math/generic/scalblnf128.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnf128 function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnf128.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float128, scalblnf128, (float128 x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnl.cpp b/libc/src/math/generic/scalblnl.cpp new file mode 100644 index 000000000000000..5823c498ba3ecc4 --- /dev/null +++ b/libc/src/math/generic/scalblnl.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnl function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnl.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long double, scalblnl, (long double x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/scalbln.h b/libc/src/math/scalbln.h new file mode 100644 index 000000000000000..b99ba7683ebc4a3 --- /dev/null +++ b/libc/src/math/scalbln.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalbln -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLN_H +#define LLVM_LIBC_SRC_MATH_SCALBLN_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double scalbln(double x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLN_H diff --git a/libc/src/math/scalblnf.h b/libc/src/math/scalblnf.h new file mode 100644 index 000000000000000..a757f528e785641 --- /dev/null +++ b/libc/src/math/scalblnf.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalblnf ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNF_H +#define LLVM_LIBC_SRC_MATH_SCALBLNF_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +float scalblnf(float x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNF_H diff --git a/libc/src/math/scalblnf128.h b/libc/src/math/scalblnf128.h new file mode 100644 index 000000000000000..b9b7a862f66f681 --- /dev/null +++ b/libc/src/math/scalblnf128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for scalblnf128 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNF128_H +#define LLVM_LIBC_SRC_MATH_SCALBLNF128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float128 scalblnf128(float128 x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNF128_H diff --git a/libc/src/math/scalblnl.h b/libc/src/math/scalblnl.h new file mode 100644 index 000000000000000..e2df840892e5e97 --- /dev/null +++ b/libc/src/math/scalblnl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalblnl ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNL_H +#define LLVM_LIBC_SRC_MATH_SCALBLNL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long double scalblnl(long double x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNL_H diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 29789f5e2adc2b8..ce12e66cf3e57f8 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -62,7 +62,7 @@ add_entrypoint_object( HDRS getenv.h DEPENDS - libc.config.linux.app_h + libc.config.app_h ) add_entrypoint_object( diff --git a/libc/src/stdlib/getenv.cpp b/libc/src/stdlib/getenv.cpp index 6b1bb693a6d8311..e6ef03fad5c51ef 100644 --- a/libc/src/stdlib/getenv.cpp +++ b/libc/src/stdlib/getenv.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/getenv.h" -#include 
"config/linux/app.h" +#include "config/app.h" #include "src/__support/CPP/string_view.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/libc/src/sys/auxv/linux/CMakeLists.txt b/libc/src/sys/auxv/linux/CMakeLists.txt index 383c29eafda8d89..4884184cc605390 100644 --- a/libc/src/sys/auxv/linux/CMakeLists.txt +++ b/libc/src/sys/auxv/linux/CMakeLists.txt @@ -11,7 +11,7 @@ add_entrypoint_object( libc.src.__support.threads.callonce libc.src.__support.common libc.src.errno.errno - libc.config.linux.app_h + libc.config.app_h libc.src.fcntl.open libc.src.unistd.read libc.src.unistd.close diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp index bfa6b23b5ef913b..236fd25698f6597 100644 --- a/libc/src/sys/auxv/linux/getauxval.cpp +++ b/libc/src/sys/auxv/linux/getauxval.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/sys/auxv/getauxval.h" -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/errno/libc_errno.h" diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt index 5e5745063fc8c39..9d0e0885dff9390 100644 --- a/libc/startup/gpu/CMakeLists.txt +++ b/libc/startup/gpu/CMakeLists.txt @@ -34,7 +34,7 @@ function(add_startup_object name) RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR} RUNTIME_OUTPUT_NAME ${name}.o) target_link_options(${fq_target_name}.exe PRIVATE - "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") + "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") endif() endfunction() diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt index 3ac104ee8ba94a4..b67a5a2cc89fb24 100644 --- a/libc/startup/gpu/amdgpu/CMakeLists.txt +++ b/libc/startup/gpu/amdgpu/CMakeLists.txt @@ -3,6 +3,7 @@ add_startup_object( SRC start.cpp DEPENDS + libc.config.app_h libc.src.__support.RPC.rpc_client 
libc.src.__support.GPU.utils libc.src.stdlib.exit diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 6bda151023c8fce..5aaa7e938d27924 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "config/gpu/app.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/macros/config.h" @@ -16,6 +17,8 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +DataEnvironment app; + extern "C" uintptr_t __init_array_start[]; extern "C" uintptr_t __init_array_end[]; extern "C" uintptr_t __fini_array_start[]; @@ -40,6 +43,8 @@ static void call_fini_array_callbacks() { extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void _begin(int argc, char **argv, char **env) { + __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, + reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED); // We want the fini array callbacks to be run after other atexit // callbacks are run.
So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index 3ac104ee8ba94a4..b67a5a2cc89fb24 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -3,6 +3,7 @@ add_startup_object( SRC start.cpp DEPENDS + libc.config.app_h libc.src.__support.RPC.rpc_client libc.src.__support.GPU.utils libc.src.stdlib.exit diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index b1ef944c4aa2894..ef1e63e5161a61a 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "config/gpu/app.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/macros/config.h" @@ -16,6 +17,8 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +DataEnvironment app; + extern "C" { // Nvidia's 'nvlink' linker does not provide these symbols. We instead need // to manually create them and update the globals in the loader implememtation. @@ -46,6 +49,9 @@ static void call_fini_array_callbacks() { extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _begin(int argc, char **argv, char **env) { + __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, + reinterpret_cast<uintptr_t *>(env), __ATOMIC_RELAXED); + // We want the fini array callbacks to be run after other atexit // callbacks are run.
So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index 336c5d0f6bfa279..71f187ca05f29fe 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -95,7 +95,7 @@ add_object_library( HDRS do_start.h DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.include.llvm-libc-macros.link_macros diff --git a/libc/startup/linux/aarch64/CMakeLists.txt b/libc/startup/linux/aarch64/CMakeLists.txt index 5ea6ae59abcb284..5564f0a8f687e69 100644 --- a/libc/startup/linux/aarch64/CMakeLists.txt +++ b/libc/startup/linux/aarch64/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -18,7 +18,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h COMPILE_OPTIONS -fno-omit-frame-pointer -ffreestanding # To avoid compiler warnings about calling the main function. 
diff --git a/libc/startup/linux/do_start.h b/libc/startup/linux/do_start.h index dd41c9bd384e7b9..8fc8c3716f2ac0d 100644 --- a/libc/startup/linux/do_start.h +++ b/libc/startup/linux/do_start.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/startup/linux/riscv/CMakeLists.txt b/libc/startup/linux/riscv/CMakeLists.txt index 3717784233c1513..2a61f8289067de2 100644 --- a/libc/startup/linux/riscv/CMakeLists.txt +++ b/libc/startup/linux/riscv/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -18,7 +18,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.src.__support.macros.attributes COMPILE_OPTIONS -fno-omit-frame-pointer diff --git a/libc/startup/linux/x86_64/CMakeLists.txt b/libc/startup/linux/x86_64/CMakeLists.txt index 30da7ab4e1ec3df..4f482eaf5d18eb6 100644 --- a/libc/startup/linux/x86_64/CMakeLists.txt +++ b/libc/startup/linux/x86_64/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -20,7 +20,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.src.__support.macros.attributes COMPILE_OPTIONS -fno-stack-protector diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt index 0985a80ce7a0907..1efdf607defe9b7 100644 --- a/libc/test/integration/src/stdlib/CMakeLists.txt +++ b/libc/test/integration/src/stdlib/CMakeLists.txt @@ -13,4 +13,3 @@ add_integration_test( FRANCE=Paris GERMANY=Berlin ) - diff --git 
a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f3703eb59999b1c..d0106972809cc02 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2464,6 +2464,19 @@ add_fp_unittest( libc.src.stdlib.srand ) +add_fp_unittest( + ddivl_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + ddivl_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.ddivl +) + add_fp_unittest( dfmal_test NEED_MPFR @@ -2518,7 +2531,6 @@ add_fp_unittest( libc.src.math.fdivl ) - add_fp_unittest( ffma_test NEED_MPFR diff --git a/libc/test/src/math/DivTest.h b/libc/test/src/math/DivTest.h index 1cdc1398a1a1c1c..c14d16fb5773bc9 100644 --- a/libc/test/src/math/DivTest.h +++ b/libc/test/src/math/DivTest.h @@ -47,7 +47,7 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { InType x = InFPBits(v).get_val(); InType y = InFPBits(w).get_val(); mpfr::BinaryInput input{x, y}; - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), 0.5); } } @@ -60,7 +60,7 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { InType x = InFPBits(v).get_val(); InType y = InFPBits(w).get_val(); mpfr::BinaryInput input{x, y}; - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), 0.5); } } diff --git a/libc/test/src/math/ddivl_test.cpp b/libc/test/src/math/ddivl_test.cpp new file mode 100644 index 000000000000000..7768766e30c12ab --- /dev/null +++ b/libc/test/src/math/ddivl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for ddivl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/ddivl.h" + +LIST_DIV_TESTS(double, long double, LIBC_NAMESPACE::ddivl) diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 45e1c4c26cc147b..609ecef42f74596 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3630,6 +3630,51 @@ add_fp_unittest( libc.src.math.atan2 ) +add_fp_unittest( + scalbln_test + SUITE + libc-math-smoke-tests + SRCS + scalbln_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalbln + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnf_test + SUITE + libc-math-smoke-tests + SRCS + scalblnf_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnf + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnl_test + SUITE + libc-math-smoke-tests + SRCS + scalblnl_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnl + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + add_fp_unittest( scalblnf16_test SUITE @@ -3640,7 +3685,24 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalblnf16 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnf128_test + SUITE + libc-math-smoke-tests + SRCS + scalblnf128_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnf128 + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3653,7 +3715,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbn + libc.src.__support.CPP.limits 
libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3666,7 +3730,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3679,7 +3745,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnl + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3692,7 +3760,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf16 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3705,7 +3775,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf128 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -4615,7 +4687,7 @@ add_fp_unittest( ) add_fp_unittest( - daddl + daddl_test SUITE libc-math-smoke-tests SRCS @@ -4627,7 +4699,7 @@ add_fp_unittest( ) add_fp_unittest( - daddf128 + daddf128_test SUITE libc-math-smoke-tests SRCS @@ -4639,7 +4711,19 @@ add_fp_unittest( ) add_fp_unittest( - ddivf128 + ddivl_test + SUITE + libc-math-smoke-tests + SRCS + ddivl_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.ddivl +) + +add_fp_unittest( + ddivf128_test SUITE libc-math-smoke-tests SRCS diff --git a/libc/test/src/math/smoke/SetPayloadSigTest.h b/libc/test/src/math/smoke/SetPayloadSigTest.h index 7ec3ac08a180abe..60913c60b481c66 100644 --- a/libc/test/src/math/smoke/SetPayloadSigTest.h +++ b/libc/test/src/math/smoke/SetPayloadSigTest.h @@ -35,7 +35,13 @@ class SetPayloadSigTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(1, func(&res, T(-1.0))); EXPECT_EQ(1, func(&res, T(0x42.1p+0))); EXPECT_EQ(1, func(&res, T(-0x42.1p+0))); - EXPECT_EQ(1, func(&res, T(StorageType(1) << (FPBits::FRACTION_LEN - 1)))); + + FPBits 
default_snan_payload_bits = FPBits::one(); + default_snan_payload_bits.set_biased_exponent(FPBits::FRACTION_LEN - 1 + + FPBits::EXP_BIAS); + T default_snan_payload = default_snan_payload_bits.get_val(); + + EXPECT_EQ(1, func(&res, default_snan_payload)); } void testValidPayloads(SetPayloadSigFunc func) { @@ -56,7 +62,12 @@ class SetPayloadSigTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(FPBits::signaling_nan(Sign::POS, 0x123).uintval(), FPBits(res).uintval()); - EXPECT_EQ(0, func(&res, T(FPBits::FRACTION_MASK >> 1))); + FPBits nan_payload_bits = FPBits::one(); + nan_payload_bits.set_biased_exponent(FPBits::FRACTION_LEN - 2 + + FPBits::EXP_BIAS); + nan_payload_bits.set_mantissa(FPBits::SIG_MASK - 3); + T nan_payload = nan_payload_bits.get_val(); + EXPECT_EQ(0, func(&res, nan_payload)); EXPECT_TRUE(FPBits(res).is_signaling_nan()); EXPECT_EQ( FPBits::signaling_nan(Sign::POS, FPBits::FRACTION_MASK >> 1).uintval(), diff --git a/libc/test/src/math/smoke/ddivl_test.cpp b/libc/test/src/math/smoke/ddivl_test.cpp new file mode 100644 index 000000000000000..7768766e30c12ab --- /dev/null +++ b/libc/test/src/math/smoke/ddivl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for ddivl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/ddivl.h" + +LIST_DIV_TESTS(double, long double, LIBC_NAMESPACE::ddivl) diff --git a/libc/test/src/math/smoke/scalbln_test.cpp b/libc/test/src/math/smoke/scalbln_test.cpp new file mode 100644 index 000000000000000..eaf7b8e47b41ce3 --- /dev/null +++ b/libc/test/src/math/smoke/scalbln_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalbln ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalbln.h" + +LIST_SCALBN_TESTS(double, long, LIBC_NAMESPACE::scalbln) diff --git a/libc/test/src/math/smoke/scalblnf128_test.cpp b/libc/test/src/math/smoke/scalblnf128_test.cpp new file mode 100644 index 000000000000000..9ee1b3816511e0d --- /dev/null +++ b/libc/test/src/math/smoke/scalblnf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnf128 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnf128.h" + +LIST_SCALBN_TESTS(float128, long, LIBC_NAMESPACE::scalblnf128) diff --git a/libc/test/src/math/smoke/scalblnf_test.cpp b/libc/test/src/math/smoke/scalblnf_test.cpp new file mode 100644 index 000000000000000..a40d7aa7886db1f --- /dev/null +++ b/libc/test/src/math/smoke/scalblnf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnf --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnf.h" + +LIST_SCALBN_TESTS(float, long, LIBC_NAMESPACE::scalblnf) diff --git a/libc/test/src/math/smoke/scalblnl_test.cpp b/libc/test/src/math/smoke/scalblnl_test.cpp new file mode 100644 index 000000000000000..ccfbe1ebdeaf094 --- /dev/null +++ b/libc/test/src/math/smoke/scalblnl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnl --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnl.h" + +LIST_SCALBN_TESTS(long double, long, LIBC_NAMESPACE::scalblnl) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index c2f4f3e755b95ef..b60b00cee4a6022 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -459,7 +459,8 @@ class OffsetGetter { // InputSectionBase. 
class RelocationScanner { public: - template void scanSection(InputSectionBase &s); + template + void scanSection(InputSectionBase &s, bool isEH = false); private: InputSectionBase *sec; @@ -1617,10 +1618,11 @@ void RelocationScanner::scan(Relocs rels) { }); } -template void RelocationScanner::scanSection(InputSectionBase &s) { +template +void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { sec = &s; getter = OffsetGetter(s); - const RelsOrRelas rels = s.template relsOrRelas(); + const RelsOrRelas rels = s.template relsOrRelas(!isEH); if (rels.areRelocsCrel()) scan(rels.crels); else if (rels.areRelocsRel()) @@ -1658,7 +1660,7 @@ template void elf::scanRelocations() { RelocationScanner scanner; for (Partition &part : partitions) { for (EhInputSection *sec : part.ehFrame->sections) - scanner.template scanSection(*sec); + scanner.template scanSection(*sec, /*isEH=*/true); if (part.armExidx && part.armExidx->isLive()) for (InputSection *sec : part.armExidx->exidxSections) if (sec->isLive()) diff --git a/lld/cmake/modules/AddLLD.cmake b/lld/cmake/modules/AddLLD.cmake index 34f9974efbf50af..9f2684b6f933eca 100644 --- a/lld/cmake/modules/AddLLD.cmake +++ b/lld/cmake/modules/AddLLD.cmake @@ -44,7 +44,7 @@ macro(add_lld_tool name) AND (NOT LLVM_DISTRIBUTION_COMPONENTS OR ${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS) ) set(get_obj_args ${ARGN}) - list(FILTER get_obj_args EXCLUDE REGEX "^(SUPPORT_PLUGINS|EXPORT_SYMBOLS_FOR_PLUGINS)$") + list(FILTER get_obj_args EXCLUDE REGEX "^SUPPORT_PLUGINS$") generate_llvm_objects(${name} ${get_obj_args}) add_custom_target(${name} DEPENDS llvm-driver) else() diff --git a/lld/test/ELF/crel.s b/lld/test/ELF/crel.s index d7c87be9a540257..1de3f314fc6770c 100644 --- a/lld/test/ELF/crel.s +++ b/lld/test/ELF/crel.s @@ -5,6 +5,7 @@ # RUN: ld.lld -pie a.o b.o -o out # RUN: llvm-objdump -d out | FileCheck %s # RUN: llvm-readelf -Srs out | FileCheck %s --check-prefix=RELOC +# RUN: llvm-dwarfdump --eh-frame out | FileCheck %s 
--check-prefix=UNWIND # CHECK: <_start>: # CHECK-NEXT: callq {{.*}} @@ -18,6 +19,13 @@ # RELOC: {{0*}}[[#DATA+8]] 0000000000000008 R_X86_64_RELATIVE [[#%x,DATA+0x8000000000000000]] +# RELOC: 00000000000012f4 0 NOTYPE GLOBAL DEFAULT [[#]] _start +# RELOC-NEXT: 00000000000012fe 0 NOTYPE GLOBAL DEFAULT [[#]] foo + +## initial_location fields in FDEs are correctly relocated. +# UNWIND: 00000018 00000010 0000001c FDE cie=00000000 pc=000012f4...000012fe +# UNWIND: 0000002c 00000010 00000030 FDE cie=00000000 pc=000012fe...0000130c + # RUN: ld.lld -pie --emit-relocs a.o b.o -o out1 # RUN: llvm-objdump -dr out1 | FileCheck %s --check-prefix=CHECKE # RUN: llvm-readelf -Sr out1 | FileCheck %s --check-prefix=RELOCE diff --git a/lld/tools/lld/CMakeLists.txt b/lld/tools/lld/CMakeLists.txt index 630d38f770a7fee..8498a91597a930c 100644 --- a/lld/tools/lld/CMakeLists.txt +++ b/lld/tools/lld/CMakeLists.txt @@ -8,8 +8,8 @@ add_lld_tool(lld SUPPORT_PLUGINS GENERATE_DRIVER - EXPORT_SYMBOLS_FOR_PLUGINS ) +export_executable_symbols_for_plugins(lld) function(lld_target_link_libraries target type) if (TARGET obj.${target}) diff --git a/lldb/docs/use/tutorial.rst b/lldb/docs/use/tutorial.rst index 22354c6720e14ad..00e7befdd087a44 100644 --- a/lldb/docs/use/tutorial.rst +++ b/lldb/docs/use/tutorial.rst @@ -168,6 +168,10 @@ is more convenient to make the basic commands unique down to a letter or two, and then learn these sequences than to fill the namespace with lots of aliases, and then have to type them all the way out. +If the alias abbreviation or the full alias command collides with another +existing command, the command resolver will prefer to use the alias over any +other command as far as there is only one alias command match. + However, users are free to customize LLDB's command set however they like, and since LLDB reads the file ``~/.lldbinit`` at startup, you can store all your aliases there and they will be generally available to you. 
Your aliases are diff --git a/lldb/include/lldb/Interpreter/CommandInterpreter.h b/lldb/include/lldb/Interpreter/CommandInterpreter.h index 48f6618ab0e3929..2bafc30cc8e23ad 100644 --- a/lldb/include/lldb/Interpreter/CommandInterpreter.h +++ b/lldb/include/lldb/Interpreter/CommandInterpreter.h @@ -295,6 +295,10 @@ class CommandInterpreter : public Broadcaster, StringList *matches = nullptr, StringList *descriptions = nullptr) const; + CommandObject * + GetAliasCommandObject(llvm::StringRef cmd, StringList *matches = nullptr, + StringList *descriptions = nullptr) const; + /// Determine whether a root level, built-in command with this name exists. bool CommandExists(llvm::StringRef cmd) const; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index c63445b7c8c8686..7c439f4ddb93e38 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -322,7 +322,13 @@ rather than using a positional placeholder:" (lldb) command alias bl3 breakpoint set -f %1 -l 3 - Always sets a breakpoint on line 3 of whatever file is indicated.)"); + Always sets a breakpoint on line 3 of whatever file is indicated. 
+ +)" + + "If the alias abbreviation or the full alias command collides with another \ existing command, the command resolver will prefer to use the alias over any \ other command as far as there is only one alias command match."); CommandArgumentEntry arg1; CommandArgumentEntry arg2; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 71c928ec811fc65..e45112530404b8e 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -520,10 +520,6 @@ void CommandInterpreter::Initialize() { cmd_obj_sp = GetCommandSPExact("scripting run"); if (cmd_obj_sp) { - AddAlias("sc", cmd_obj_sp); - AddAlias("scr", cmd_obj_sp); - AddAlias("scri", cmd_obj_sp); - AddAlias("scrip", cmd_obj_sp); AddAlias("script", cmd_obj_sp); } @@ -1302,6 +1298,38 @@ CommandObject *CommandInterpreter::GetUserCommandObject( return {}; } +CommandObject *CommandInterpreter::GetAliasCommandObject( + llvm::StringRef cmd, StringList *matches, StringList *descriptions) const { + auto find_exact = + [&](const CommandObject::CommandMap &map) -> CommandObject * { + auto found_elem = map.find(cmd.str()); + if (found_elem == map.end()) + return nullptr; + CommandObject *exact_cmd = found_elem->second.get(); + if (!exact_cmd) + return nullptr; + + if (matches) + matches->AppendString(exact_cmd->GetCommandName()); + + if (descriptions) + descriptions->AppendString(exact_cmd->GetHelp()); + + return exact_cmd; + }; + + CommandObject *exact_cmd = find_exact(GetAliases()); + if (exact_cmd) + return exact_cmd; + + // We didn't have an exact command, so now look for partial matches. + StringList tmp_list; + StringList *matches_ptr = matches ?
matches : &tmp_list; + AddNamesMatchingPartialString(GetAliases(), cmd, *matches_ptr); + + return {}; +} + bool CommandInterpreter::CommandExists(llvm::StringRef cmd) const { return m_command_dict.find(std::string(cmd)) != m_command_dict.end(); } @@ -3421,6 +3450,19 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, std::string next_word; StringList matches; bool done = false; + + auto build_alias_cmd = [&](std::string &full_name) { + revised_command_line.Clear(); + matches.Clear(); + std::string alias_result; + cmd_obj = + BuildAliasResult(full_name, scratch_command, alias_result, result); + revised_command_line.Printf("%s", alias_result.c_str()); + if (cmd_obj) { + wants_raw_input = cmd_obj->WantsRawCommandString(); + } + }; + while (!done) { char quote_char = '\0'; std::string suffix; @@ -3432,14 +3474,7 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, bool is_real_command = (!is_alias) || (cmd_obj != nullptr && !cmd_obj->IsAlias()); if (!is_real_command) { - matches.Clear(); - std::string alias_result; - cmd_obj = - BuildAliasResult(full_name, scratch_command, alias_result, result); - revised_command_line.Printf("%s", alias_result.c_str()); - if (cmd_obj) { - wants_raw_input = cmd_obj->WantsRawCommandString(); - } + build_alias_cmd(full_name); } else { if (cmd_obj) { llvm::StringRef cmd_name = cmd_obj->GetCommandName(); @@ -3486,21 +3521,32 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, if (cmd_obj == nullptr) { const size_t num_matches = matches.GetSize(); if (matches.GetSize() > 1) { - StreamString error_msg; - error_msg.Printf("Ambiguous command '%s'. 
Possible matches:\n", - next_word.c_str()); + StringList alias_matches; + GetAliasCommandObject(next_word, &alias_matches); + + if (alias_matches.GetSize() == 1) { + std::string full_name; + GetAliasFullName(alias_matches.GetStringAtIndex(0), full_name); + build_alias_cmd(full_name); + done = static_cast(cmd_obj); + } else { + StreamString error_msg; + error_msg.Printf("Ambiguous command '%s'. Possible matches:\n", + next_word.c_str()); - for (uint32_t i = 0; i < num_matches; ++i) { - error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i)); + for (uint32_t i = 0; i < num_matches; ++i) { + error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i)); + } + result.AppendRawError(error_msg.GetString()); } - result.AppendRawError(error_msg.GetString()); } else { // We didn't have only one match, otherwise we wouldn't get here. lldbassert(num_matches == 0); result.AppendErrorWithFormat("'%s' is not a valid command.\n", next_word.c_str()); } - return nullptr; + if (!done) + return nullptr; } if (cmd_obj->IsMultiwordObject()) { diff --git a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py new file mode 100644 index 000000000000000..14c66fefea7efda --- /dev/null +++ b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py @@ -0,0 +1,35 @@ +""" +Test how lldb reacts to ambiguous commands +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class AmbiguousCommandTestCase(TestBase): + @no_debug_info_test + def test_ambiguous_command_with_alias(self): + command_interpreter = self.dbg.GetCommandInterpreter() + self.assertTrue(command_interpreter, VALID_COMMAND_INTERPRETER) + result = lldb.SBCommandReturnObject() + + command_interpreter.HandleCommand( + "command alias corefile target create -c %0", result + ) + self.assertTrue(result.Succeeded()) + + 
command_interpreter.ResolveCommand("co", result) + self.assertFalse(result.Succeeded()) + self.assertEqual( + result.GetError(), + "Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n", + ) + + command_interpreter.HandleCommand("command unalias continue", result) + self.assertTrue(result.Succeeded()) + + command_interpreter.ResolveCommand("co", result) + self.assertTrue(result.Succeeded()) + self.assertEqual(result.GetOutput(), "target create -c %0") diff --git a/lldb/test/API/functionalities/ambigous_commands/categories b/lldb/test/API/functionalities/ambigous_commands/categories new file mode 100644 index 000000000000000..3a3f4df6416b9c8 --- /dev/null +++ b/lldb/test/API/functionalities/ambigous_commands/categories @@ -0,0 +1 @@ +cmdline diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 257dc2250bb4ef8..3e7e3a965559afa 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1010,7 +1010,7 @@ endmacro() macro(add_llvm_executable name) cmake_parse_arguments(ARG - "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH;SUPPORT_PLUGINS;EXPORT_SYMBOLS;EXPORT_SYMBOLS_FOR_PLUGINS" + "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH;SUPPORT_PLUGINS;EXPORT_SYMBOLS" "ENTITLEMENTS;BUNDLE_PATH" "" ${ARGN}) @@ -1081,12 +1081,6 @@ macro(add_llvm_executable name) endif() endif() - if (ARG_EXPORT_SYMBOLS) - export_executable_symbols(${name}) - elseif(ARG_EXPORT_SYMBOLS_FOR_PLUGINS) - export_executable_symbols_for_plugins(${name}) - endif() - if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) set(USE_SHARED USE_SHARED) endif() @@ -1118,6 +1112,10 @@ macro(add_llvm_executable name) endif() llvm_codesign(${name} ENTITLEMENTS ${ARG_ENTITLEMENTS} BUNDLE_PATH ${ARG_BUNDLE_PATH}) + + if (ARG_EXPORT_SYMBOLS) + export_executable_symbols(${name}) + endif() endmacro(add_llvm_executable name) # add_llvm_pass_plugin(name [NO_MODULE] 
...) diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index 39b4abaa0d9313e..e36a71f522d82c9 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -71,6 +71,12 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND NOT LIBC_HDRGEN_EXE) set(libc_flags -DLLVM_LIBC_FULL_BUILD=ON -DLIBC_HDRGEN_ONLY=ON) + if(MSVC) + # Due to some issues mentioned in llvm/projects/CMakeLists.txt, libc build is disabled by + # default in the cross target when building with MSVC compatible compilers on Windows. Add + # LLVM_FORCE_BUILD_RUNTIME to bypass this issue and force its building on Windows. + list(APPEND libc_flags -DLLVM_FORCE_BUILD_RUNTIME=ON) + endif() endif() add_custom_command(OUTPUT ${${project_name}_${target_name}_BUILD}/CMakeCache.txt diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index b17e3c828ed3d5e..0ee4d7b444cfcf7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1675,7 +1675,8 @@ Currently, only the following parameter attributes are defined: - The pair ``a,b`` represents the range ``[a,b)``. - Both ``a`` and ``b`` are constants. - The range is allowed to wrap. - - The range should not represent the full or empty set. That is, ``a!=b``. + - The empty range is represented using ``0,0``. + - Otherwise, ``a`` and ``b`` are not allowed to be equal. This attribute may only be applied to parameters or return values with integer or vector of integer types. @@ -2309,6 +2310,10 @@ example: This attribute indicates that MemTagSanitizer checks (dynamic address safety analysis based on Armv8 MTE) are enabled for this function. +``sanitize_realtime`` + This attribute indicates that RealtimeSanitizer checks + (realtime safety analysis - no allocations, syscalls or exceptions) are enabled + for this function. 
``speculative_load_hardening`` This attribute indicates that `Speculative Load Hardening `_ diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 1ed860de6b9dce9..c98171bc8fd8c1b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -108,6 +108,9 @@ Changes to the RISC-V Backend fill value) rather than NOPs. * Added Syntacore SCR4 CPUs: ``-mcpu=syntacore-scr4-rv32/64`` * ``-mcpu=sifive-p470`` was added. +* Fixed length vector support using RVV instructions now requires VLEN>=64. This + means Zve32x and Zve32f will also require Zvl64b. The prior support was + largely untested. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/include/llvm/ADT/GraphTraits.h b/llvm/include/llvm/ADT/GraphTraits.h index 0764ecb4bb56823..20bb27f50e17f95 100644 --- a/llvm/include/llvm/ADT/GraphTraits.h +++ b/llvm/include/llvm/ADT/GraphTraits.h @@ -97,7 +97,8 @@ struct GraphTraits { namespace detail { template -using has_number_t = decltype(GraphTraits::getNumber(std::declval())); +using has_number_t = decltype(GraphTraits::getNumber( + std::declval::NodeRef>())); } // namespace detail /// Indicate whether a GraphTraits::getNumber() is supported. 
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index fb88f2fe75adb51..4beac37a5834457 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -758,6 +758,7 @@ enum AttributeKindCodes { ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 93, ATTR_KIND_INITIALIZES = 94, ATTR_KIND_HYBRID_PATCHABLE = 95, + ATTR_KIND_SANITIZE_REALTIME = 96, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 93086d4ac2c8df7..890c2b8ca36e117 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2135,21 +2135,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Cost; } case Intrinsic::smax: + ISD = ISD::SMAX; + break; case Intrinsic::smin: + ISD = ISD::SMIN; + break; case Intrinsic::umax: - case Intrinsic::umin: { - // minmax(X,Y) = select(icmp(X,Y),X,Y) - Type *CondTy = RetTy->getWithNewBitWidth(1); - bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; - CmpInst::Predicate Pred = - IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - return Cost; - } + ISD = ISD::UMAX; + break; + case Intrinsic::umin: + ISD = ISD::UMIN; + break; case Intrinsic::sadd_sat: ISD = ISD::SADDSAT; break; @@ -2163,101 +2159,29 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { ISD = ISD::USUBSAT; break; case Intrinsic::smul_fix: - case Intrinsic::umul_fix: { - unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; - Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); - - unsigned ExtOp = - IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; - - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); - return Cost; - } + ISD = ISD::SMULFIX; + break; + case Intrinsic::umul_fix: + ISD = ISD::UMULFIX; + break; case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::sadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - - // Add: - // Overflow -> (Result < LHS) ^ (RHS < 0) - // Sub: - // Overflow -> (Result < LHS) ^ (RHS > 0) - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost( - Instruction::ICmp, SumTy, OverflowTy, - CmpInst::ICMP_SGT, CostKind); - Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, - CostKind); - return Cost; - } + ISD = ISD::SADDO; + break; + case Intrinsic::ssub_with_overflow: + ISD = ISD::SSUBO; + break; case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::uadd_with_overflow - ? 
BinaryOperator::Add - : BinaryOperator::Sub; - CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow - ? CmpInst::ICMP_ULT - : CmpInst::ICMP_UGT; - - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, - Pred, CostKind); - return Cost; - } + ISD = ISD::UADDO; + break; + case Intrinsic::usub_with_overflow: + ISD = ISD::USUBO; + break; case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: { - Type *MulTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; - Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); - bool IsSigned = IID == Intrinsic::smul_with_overflow; - - unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; - - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - if (IsSigned) - Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); - return Cost; - } + ISD = ISD::SMULO; + break; + case Intrinsic::umul_with_overflow: + ISD = ISD::UMULO; + break; case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: { if (Tys.empty()) @@ -2304,8 +2228,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; } + auto *ST = dyn_cast(RetTy); + Type 
*LegalizeTy = ST ? ST->getContainedType(0) : RetTy; + std::pair LT = getTypeLegalizationCost(LegalizeTy); + const TargetLoweringBase *TLI = getTLI(); - std::pair LT = getTypeLegalizationCost(RetTy); if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && @@ -2345,6 +2272,91 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); } + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::umin: + case Intrinsic::umax: { + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; + CmpInst::Predicate Pred = + IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + Pred, CostKind); + return Cost; + } + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::sadd_with_overflow + ? 
BinaryOperator::Add + : BinaryOperator::Sub; + + // Add: + // Overflow -> (Result < LHS) ^ (RHS < 0) + // Sub: + // Overflow -> (Result < LHS) ^ (RHS > 0) + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += + 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy, + CmpInst::ICMP_SGT, CostKind); + Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, + CostKind); + return Cost; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::uadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow + ? CmpInst::ICMP_ULT + : CmpInst::ICMP_UGT; + + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, + OverflowTy, Pred, CostKind); + return Cost; + } + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + Type *MulTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; + Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); + bool IsSigned = IID == Intrinsic::smul_with_overflow; + + unsigned ExtOp = IsSigned ? 
Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + if (IsSigned) + Cost += thisT()->getArithmeticInstrCost( + Instruction::AShr, MulTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { // Assume a default expansion. @@ -2386,6 +2398,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } + case Intrinsic::smul_fix: + case Intrinsic::umul_fix: { + unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; + Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); + + unsigned ExtOp = + IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost( + Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } default: break; } diff --git a/llvm/include/llvm/CodeGen/ExpandVectorPredication.h b/llvm/include/llvm/CodeGen/ExpandVectorPredication.h index c42c644c99e91ec..3aafb22eab6beaf 100644 --- a/llvm/include/llvm/CodeGen/ExpandVectorPredication.h +++ b/llvm/include/llvm/CodeGen/ExpandVectorPredication.h @@ -16,10 +16,21 @@ namespace llvm { class TargetTransformInfo; class VPIntrinsic; -/// Expand a vector predication intrinsic. Returns true if the intrinsic was -/// removed/replaced. -bool expandVectorPredicationIntrinsic(VPIntrinsic &VPI, - const TargetTransformInfo &TTI); +/// Represents the details the expansion of a VP intrinsic. +enum class VPExpansionDetails { + /// No change happened during expansion. + IntrinsicUnchanged, + /// At least one operand was updated. + IntrinsicUpdated, + /// The whole intrinsic was replaced. + IntrinsicReplaced, +}; + +/// Expand a vector predication intrinsic. Returns the kind of expansion +/// that was applied to the intrinsic. 
+VPExpansionDetails +expandVectorPredicationIntrinsic(VPIntrinsic &VPI, + const TargetTransformInfo &TTI); } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 797e29d071dd9a4..2d238326ee1a307 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -1304,6 +1304,9 @@ template <> struct GraphTraits { } }; +static_assert(GraphHasNodeNumbers, + "GraphTraits getNumber() not detected"); + template <> struct GraphTraits { using NodeRef = const MachineBasicBlock *; using ChildIteratorType = MachineBasicBlock::const_succ_iterator; @@ -1318,6 +1321,9 @@ template <> struct GraphTraits { } }; +static_assert(GraphHasNodeNumbers, + "GraphTraits getNumber() not detected"); + // Provide specializations of GraphTraits to be able to treat a // MachineFunction as a graph of MachineBasicBlocks and to walk it // in inverse order. Inverse order for a function is considered @@ -1341,6 +1347,9 @@ template <> struct GraphTraits> { } }; +static_assert(GraphHasNodeNumbers>, + "GraphTraits getNumber() not detected"); + template <> struct GraphTraits> { using NodeRef = const MachineBasicBlock *; using ChildIteratorType = MachineBasicBlock::const_pred_iterator; @@ -1358,6 +1367,9 @@ template <> struct GraphTraits> { } }; +static_assert(GraphHasNodeNumbers>, + "GraphTraits getNumber() not detected"); + // These accessors are handy for sharing templated code between IR and MIR. 
inline auto successors(const MachineBasicBlock *BB) { return BB->successors(); } inline auto predecessors(const MachineBasicBlock *BB) { diff --git a/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/llvm/include/llvm/FuzzMutate/OpDescriptor.h index 00a8ea0e5babc6c..78114074dbbfce7 100644 --- a/llvm/include/llvm/FuzzMutate/OpDescriptor.h +++ b/llvm/include/llvm/FuzzMutate/OpDescriptor.h @@ -89,7 +89,7 @@ class SourcePred { struct OpDescriptor { unsigned Weight; SmallVector SourcePreds; - std::function, Instruction *)> BuilderFunc; + std::function, BasicBlock::iterator)> BuilderFunc; }; static inline SourcePred onlyType(Type *Only) { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index e1bd193891c1e1d..891e34fec0c7985 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -297,6 +297,9 @@ def SanitizeMemTag : EnumAttr<"sanitize_memtag", [FnAttr]>; /// NumericalStabilitySanitizer is on. def SanitizeNumericalStability : EnumAttr<"sanitize_numerical_stability", [FnAttr]>; +/// RealtimeSanitizer is on. +def SanitizeRealtime : EnumAttr<"sanitize_realtime", [FnAttr]>; + /// Speculative Load Hardening is enabled. /// /// Note that this uses the default compatibility (always compatible during @@ -385,6 +388,7 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; +def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index d7b94d50e631117..7e2b68e6faea29c 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -262,7 +262,10 @@ class DominatorTreeBase { SmallVector>>; DomTreeNodeStorageTy DomTreeNodes; // For graphs where blocks don't have numbers, create a numbering here. 
- DenseMap NodeNumberMap; + // TODO: use an empty struct with [[no_unique_address]] in C++20. + std::conditional_t, + DenseMap, std::tuple<>> + NodeNumberMap; DomTreeNodeBase *RootNode = nullptr; ParentPtr Parent = nullptr; @@ -355,12 +358,8 @@ class DominatorTreeBase { } private: - template - using has_number_t = - decltype(GraphTraits::getNumber(std::declval())); - std::optional getNodeIndex(const NodeT *BB) const { - if constexpr (is_detected::value) { + if constexpr (GraphHasNodeNumbers) { // BB can be nullptr, map nullptr to index 0. assert(BlockNumberEpoch == GraphTraits::getNumberEpoch(Parent) && @@ -374,7 +373,7 @@ class DominatorTreeBase { } unsigned getNodeIndexForInsert(const NodeT *BB) { - if constexpr (is_detected::value) { + if constexpr (GraphHasNodeNumbers) { // getNodeIndex will never fail if nodes have getNumber(). unsigned Idx = *getNodeIndex(BB); if (Idx >= DomTreeNodes.size()) { @@ -736,7 +735,8 @@ class DominatorTreeBase { } DomTreeNodes[*IdxOpt] = nullptr; - NodeNumberMap.erase(BB); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.erase(BB); if (!IsPostDom) return; @@ -830,7 +830,7 @@ class DominatorTreeBase { private: void updateBlockNumberEpoch() { // Nothing to do for graphs that don't number their blocks. - if constexpr (is_detected::value) + if constexpr (GraphHasNodeNumbers) BlockNumberEpoch = GraphTraits::getNumberEpoch(Parent); } @@ -849,9 +849,8 @@ class DominatorTreeBase { } /// Update dominator tree after renumbering blocks. 
- template - std::enable_if_t::value, void> - updateBlockNumbers() { + template + std::enable_if_t, void> updateBlockNumbers() { updateBlockNumberEpoch(); unsigned MaxNumber = GraphTraits::getMaxNumber(Parent); @@ -889,7 +888,8 @@ class DominatorTreeBase { void reset() { DomTreeNodes.clear(); - NodeNumberMap.clear(); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.clear(); Roots.clear(); RootNode = nullptr; Parent = nullptr; @@ -989,7 +989,8 @@ class DominatorTreeBase { /// assignable and destroyable state, but otherwise invalid. void wipe() { DomTreeNodes.clear(); - NodeNumberMap.clear(); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.clear(); RootNode = nullptr; Parent = nullptr; } diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h index 5256aff56205bab..f127d16b8de124d 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h +++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h @@ -19,7 +19,8 @@ class Type; class PGOCtxProfLoweringPass : public PassInfoMixin { public: explicit PGOCtxProfLoweringPass() = default; - static bool isContextualIRPGOEnabled(); + // True if contextual instrumentation is enabled. + static bool isCtxIRPGOInstrEnabled(); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h new file mode 100644 index 000000000000000..f2ce1636551ce2f --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h @@ -0,0 +1,38 @@ +//===- RealtimeSanitizer.h - RealtimeSanitizer instrumentation --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the RealtimeSanitizer, an LLVM transformation for +// detecting and reporting realtime safety violations. +// +// The instrumentation pass inserts calls to __rtsan_realtime_enter and +// __rtsan_realtime_exit at the entry and exit points of functions that are +// marked with the appropriate attribute. +// +// See also: llvm-project/compiler-rt/lib/rtsan/ +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct RealtimeSanitizerOptions {}; + +class RealtimeSanitizerPass : public PassInfoMixin { +public: + RealtimeSanitizerPass(const RealtimeSanitizerOptions &Options); + PreservedAnalyses run(Function &F, AnalysisManager &AM); + + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index f56f910bd778443..fbae705127538a9 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -17,11 +17,17 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/PGOCtxProfReader.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #define DEBUG_TYPE "ctx_prof" +using namespace llvm; +cl::opt + UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, + cl::desc("Use the specified contextual profile file")); + namespace llvm { namespace json { Value toJSON(const PGOCtxProfContext &P) { @@ -58,8 +64,6 @@ Value toJSON(const PGOCtxProfContext::CallTargetMapTy &P) { } // namespace json } // namespace 
llvm -using namespace llvm; - AnalysisKey CtxProfAnalysis::Key; CtxProfAnalysis::Result CtxProfAnalysis::run(Module &M, diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 904d30d0544654d..caed62679a683cc 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -135,16 +135,21 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock, // todo: this would be a lot more powerful if we used scev, but all the // plumbing is currently missing to pass a pointer in from the pass // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known + ICmpInst::Predicate Pred = Cond->getPredicate(); auto *LHS = dyn_cast(Cond->getOperand(0)); auto *RHS = Cond->getOperand(1); - if (!LHS || LHS->getParent() != CurLoop->getHeader()) - return false; - auto DL = ExitBlock->getDataLayout(); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) { + Pred = Cond->getSwappedPredicate(); + LHS = dyn_cast(Cond->getOperand(1)); + RHS = Cond->getOperand(0); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) + return false; + } + + auto DL = ExitBlock->getModule()->getDataLayout(); auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader()); - auto *SimpleValOrNull = simplifyCmpInst(Cond->getPredicate(), - IVStart, RHS, - {DL, /*TLI*/ nullptr, - DT, /*AC*/ nullptr, BI}); + auto *SimpleValOrNull = simplifyCmpInst( + Pred, IVStart, RHS, {DL, /*TLI*/ nullptr, DT, /*AC*/ nullptr, BI}); auto *SimpleCst = dyn_cast_or_null(SimpleValOrNull); if (!SimpleCst) return false; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 9358f89e2bf9dca..f41907f03512576 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3109,8 +3109,8 @@ bool LLParser::parseRangeAttr(AttrBuilder &B) { if (ParseAPSInt(BitWidth, Lower) || parseToken(lltok::comma, "expected ','") || ParseAPSInt(BitWidth, Upper)) return true; - if (Lower == Upper) - return 
tokError("the range should not represent the full or empty set!"); + if (Lower == Upper && !Lower.isZero()) + return tokError("the range represent the empty set but limits aren't 0!"); if (parseToken(lltok::rparen, "expected ')'")) return true; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index fd4ae109b4bb8f7..d4dbab04e8ecdbc 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2141,6 +2141,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::SanitizeMemory; case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY: return Attribute::SanitizeNumericalStability; + case bitc::ATTR_KIND_SANITIZE_REALTIME: + return Attribute::SanitizeRealtime; case bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING: return Attribute::SpeculativeLoadHardening; case bitc::ATTR_KIND_SWIFT_ERROR: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 52e15e6880ef286..33ec14b60dd2884 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -843,6 +843,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_SANITIZE_MEMORY; case Attribute::SanitizeNumericalStability: return bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY; + case Attribute::SanitizeRealtime: + return bitc::ATTR_KIND_SANITIZE_REALTIME; case Attribute::SpeculativeLoadHardening: return bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING; case Attribute::SwiftError: diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 5ffdbcda93e7530..675d88d6d38cd9e 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -160,11 +160,15 @@ struct CachingVPExpander { Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam, ElementCount ElemCount); - Value *foldEVLIntoMask(VPIntrinsic 
&VPI); + /// If needed, folds the EVL in the mask operand and discards the EVL + /// parameter. Returns a pair of the value of the intrinsic after the change + /// (if any) and whether the mask was actually folded. + std::pair foldEVLIntoMask(VPIntrinsic &VPI); /// "Remove" the %evl parameter of \p PI by setting it to the static vector - /// length of the operation. - void discardEVLParameter(VPIntrinsic &PI); + /// length of the operation. Returns true if the %evl (if any) was effectively + /// changed. + bool discardEVLParameter(VPIntrinsic &PI); /// Lower this VP binary operator to a unpredicated binary operator. Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, @@ -206,7 +210,9 @@ struct CachingVPExpander { CachingVPExpander(const TargetTransformInfo &TTI) : TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {} - bool expandVectorPredication(VPIntrinsic &VPI); + /// Expand llvm.vp.* intrinsics as requested by \p TTI. + /// Returns the details of the expansion. + VPExpansionDetails expandVectorPredication(VPIntrinsic &VPI); }; //// CachingVPExpander { @@ -645,15 +651,15 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, return NewCmp; } -void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { +bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); if (VPI.canIgnoreVectorLengthParam()) - return; + return false; Value *EVLParam = VPI.getVectorLengthParam(); if (!EVLParam) - return; + return false; ElementCount StaticElemCount = VPI.getStaticVectorLength(); Value *MaxEVL = nullptr; @@ -672,16 +678,17 @@ void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false); } VPI.setVectorLengthParam(MaxEVL); + return true; } -Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { +std::pair CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() 
<< "Folding vlen for " << VPI << '\n'); IRBuilder<> Builder(&VPI); // Ineffective %evl parameter and so nothing to do here. if (VPI.canIgnoreVectorLengthParam()) - return &VPI; + return {&VPI, false}; // Only VP intrinsics can have an %evl parameter. Value *OldMaskParam = VPI.getMaskParam(); @@ -704,7 +711,7 @@ Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { "transformation did not render the evl param ineffective!"); // Reassess the modified instruction. - return &VPI; + return {&VPI, true}; } Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { @@ -807,21 +814,27 @@ CachingVPExpander::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return VPStrat; } -/// Expand llvm.vp.* intrinsics as requested by \p TTI. -bool CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { +VPExpansionDetails +CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { auto Strategy = getVPLegalizationStrategy(VPI); sanitizeStrategy(VPI, Strategy); + VPExpansionDetails Changed = VPExpansionDetails::IntrinsicUnchanged; + // Transform the EVL parameter. switch (Strategy.EVLParamStrategy) { case VPLegalization::Legal: break; case VPLegalization::Discard: - discardEVLParameter(VPI); + if (discardEVLParameter(VPI)) + Changed = VPExpansionDetails::IntrinsicUpdated; break; case VPLegalization::Convert: - if (foldEVLIntoMask(VPI)) + if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) { + (void)NewVPI; + Changed = VPExpansionDetails::IntrinsicUpdated; ++NumFoldedVL; + } break; } @@ -834,17 +847,17 @@ bool CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { case VPLegalization::Convert: if (Value *V = expandPredication(VPI); V != &VPI) { ++NumLoweredVPOps; - // Return true if and only if the intrinsic was actually removed. 
- return true; + Changed = VPExpansionDetails::IntrinsicReplaced; } break; } - return false; + return Changed; } } // namespace -bool llvm::expandVectorPredicationIntrinsic(VPIntrinsic &VPI, - const TargetTransformInfo &TTI) { +VPExpansionDetails +llvm::expandVectorPredicationIntrinsic(VPIntrinsic &VPI, + const TargetTransformInfo &TTI) { return CachingVPExpander(TTI).expandVectorPredication(VPI); } diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 0df90f402aaf481..3373b76edb268f8 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -361,10 +361,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { Function *Parent = CI->getParent()->getParent(); const TargetTransformInfo &TTI = LookupTTI(*Parent); auto *VPI = cast(CI); - return expandVectorPredicationIntrinsic(*VPI, TTI); + VPExpansionDetails ED = expandVectorPredicationIntrinsic(*VPI, TTI); + // Expansion of VP intrinsics may change the IR but not actually + // replace the intrinsic, so update Changed for the pass + // and compute Removed for forEachCall. + Changed |= ED != VPExpansionDetails::IntrinsicUnchanged; + bool Removed = ED == VPExpansionDetails::IntrinsicReplaced; + return Removed; }); - // Not all intrinsics are removed, but the code is changed in any case. 
- Changed = true; break; case Intrinsic::objc_autorelease: Changed |= lowerObjCCall(F, "objc_autorelease"); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7a6a1c6b832b39c..f827eb559a01cf8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2563,14 +2563,12 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { if ((!LegalOperations || hasOperation(ISD::AVGCEILU, VT)) && sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)), - m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGCEILU, DL, VT, A, B); } if ((!LegalOperations || hasOperation(ISD::AVGCEILS, VT)) && sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)), - m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGCEILS, DL, VT, A, B); } return SDValue(); @@ -2928,14 +2926,12 @@ SDValue DAGCombiner::foldAddToAvg(SDNode *N, const SDLoc &DL) { if ((!LegalOperations || hasOperation(ISD::AVGFLOORU, VT)) && sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)), - m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGFLOORU, DL, VT, A, B); } if ((!LegalOperations || hasOperation(ISD::AVGFLOORS, VT)) && sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)), - m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGFLOORS, DL, VT, A, B); } diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 0d117f7cf8734f8..a71afe1a3162f03 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1697,22 
+1697,24 @@ Error MachOPlatform::MachOPlatformPlugin::addSymbolTableRegistration( HeaderAddr = I->second; } - SymbolTableVector LocalSymTab; - auto &SymTab = LLVM_LIKELY(!InBootstrapPhase) ? LocalSymTab - : MP.Bootstrap.load()->SymTab; + if (LLVM_UNLIKELY(InBootstrapPhase)) { + // If we're in the bootstrap phase then just record these symbols in the + // bootstrap object and then bail out -- registration will be attached to + // the bootstrap graph. + std::lock_guard Lock(MP.Bootstrap.load()->Mutex); + auto &SymTab = MP.Bootstrap.load()->SymTab; + for (auto &[OriginalSymbol, NameSym] : JITSymTabInfo) + SymTab.push_back({NameSym->getAddress(), OriginalSymbol->getAddress(), + flagsForSymbol(*OriginalSymbol)}); + return Error::success(); + } + + SymbolTableVector SymTab; for (auto &[OriginalSymbol, NameSym] : JITSymTabInfo) SymTab.push_back({NameSym->getAddress(), OriginalSymbol->getAddress(), flagsForSymbol(*OriginalSymbol)}); - // Bail out if we're in the bootstrap phase -- registration of thees symbols - // will be attached to the bootstrap graph. - if (LLVM_UNLIKELY(InBootstrapPhase)) - return Error::success(); - - shared::AllocActions &allocActions = LLVM_LIKELY(!InBootstrapPhase) - ? 
G.allocActions() - : MP.Bootstrap.load()->DeferredAAs; - allocActions.push_back( + G.allocActions().push_back( {cantFail(WrapperFunctionCall::Create( MP.RegisterObjectSymbolTable.Addr, HeaderAddr, SymTab)), cantFail(WrapperFunctionCall::Create( diff --git a/llvm/lib/FuzzMutate/IRMutator.cpp b/llvm/lib/FuzzMutate/IRMutator.cpp index 3f27daad55e39ba..72e0de593760760 100644 --- a/llvm/lib/FuzzMutate/IRMutator.cpp +++ b/llvm/lib/FuzzMutate/IRMutator.cpp @@ -148,7 +148,7 @@ void InjectorIRStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { for (const auto &Pred : ArrayRef(OpDesc->SourcePreds).slice(1)) Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); - if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP])) { + if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP]->getIterator())) { // Find a sink and wire up the results of the operation. IB.connectToSink(BB, InstsAfter, Op); } @@ -388,9 +388,9 @@ void InsertFunctionStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { } bool isRetVoid = (F->getReturnType() == Type::getVoidTy(M->getContext())); auto BuilderFunc = [FTy, F, isRetVoid](ArrayRef Srcs, - Instruction *Inst) { + BasicBlock::iterator InsertPt) { StringRef Name = isRetVoid ? nullptr : "C"; - CallInst *Call = CallInst::Create(FTy, F, Srcs, Name, Inst); + CallInst *Call = CallInst::Create(FTy, F, Srcs, Name, InsertPt); // Don't return this call inst if it return void as it can't be sinked. return isRetVoid ? nullptr : Call; }; @@ -414,7 +414,7 @@ void InsertFunctionStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); } - if (Value *Op = BuilderFunc(Srcs, Insts[IP])) { + if (Value *Op = BuilderFunc(Srcs, Insts[IP]->getIterator())) { // Find a sink and wire up the results of the operation. 
IB.connectToSink(BB, InstsAfter, Op); } @@ -543,7 +543,7 @@ void InsertPHIStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { if (&BB == &BB.getParent()->getEntryBlock()) return; Type *Ty = IB.randomType(); - PHINode *PHI = PHINode::Create(Ty, llvm::pred_size(&BB), "", &BB.front()); + PHINode *PHI = PHINode::Create(Ty, llvm::pred_size(&BB), "", BB.begin()); // Use a map to make sure the same incoming basic block has the same value. DenseMap IncomingValues; diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp index 408f35879acd3b4..389ff8130771c88 100644 --- a/llvm/lib/FuzzMutate/Operations.cpp +++ b/llvm/lib/FuzzMutate/Operations.cpp @@ -98,8 +98,8 @@ void llvm::describeFuzzerVectorOps(std::vector &Ops) { } OpDescriptor llvm::fuzzerop::selectDescriptor(unsigned Weight) { - auto buildOp = [](ArrayRef Srcs, Instruction *Inst) { - return SelectInst::Create(Srcs[0], Srcs[1], Srcs[2], "S", Inst); + auto buildOp = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return SelectInst::Create(Srcs[0], Srcs[1], Srcs[2], "S", InsertPt); }; return {Weight, {boolOrVecBoolType(), matchFirstLengthWAnyType(), matchSecondType()}, @@ -107,16 +107,16 @@ OpDescriptor llvm::fuzzerop::selectDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::fnegDescriptor(unsigned Weight) { - auto buildOp = [](ArrayRef Srcs, Instruction *Inst) { - return UnaryOperator::Create(Instruction::FNeg, Srcs[0], "F", Inst); + auto buildOp = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return UnaryOperator::Create(Instruction::FNeg, Srcs[0], "F", InsertPt); }; return {Weight, {anyFloatOrVecFloatType()}, buildOp}; } OpDescriptor llvm::fuzzerop::binOpDescriptor(unsigned Weight, Instruction::BinaryOps Op) { - auto buildOp = [Op](ArrayRef Srcs, Instruction *Inst) { - return BinaryOperator::Create(Op, Srcs[0], Srcs[1], "B", Inst); + auto buildOp = [Op](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return BinaryOperator::Create(Op, Srcs[0], Srcs[1], "B", 
InsertPt); }; switch (Op) { case Instruction::Add: @@ -148,8 +148,9 @@ OpDescriptor llvm::fuzzerop::binOpDescriptor(unsigned Weight, OpDescriptor llvm::fuzzerop::cmpOpDescriptor(unsigned Weight, Instruction::OtherOps CmpOp, CmpInst::Predicate Pred) { - auto buildOp = [CmpOp, Pred](ArrayRef Srcs, Instruction *Inst) { - return CmpInst::Create(CmpOp, Pred, Srcs[0], Srcs[1], "C", Inst); + auto buildOp = [CmpOp, Pred](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + return CmpInst::Create(CmpOp, Pred, Srcs[0], Srcs[1], "C", InsertPt); }; switch (CmpOp) { @@ -163,9 +164,10 @@ OpDescriptor llvm::fuzzerop::cmpOpDescriptor(unsigned Weight, } OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { - auto buildSplitBlock = [](ArrayRef Srcs, Instruction *Inst) { - BasicBlock *Block = Inst->getParent(); - BasicBlock *Next = Block->splitBasicBlock(Inst, "BB"); + auto buildSplitBlock = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + BasicBlock *Block = InsertPt->getParent(); + BasicBlock *Next = Block->splitBasicBlock(InsertPt, "BB"); // If it was an exception handling block, we are done. if (Block->isEHPad()) @@ -174,7 +176,8 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { // Loop back on this block by replacing the unconditional forward branch // with a conditional with a backedge. if (Block != &Block->getParent()->getEntryBlock()) { - BranchInst::Create(Block, Next, Srcs[0], Block->getTerminator()); + BranchInst::Create(Block, Next, Srcs[0], + Block->getTerminator()->getIterator()); Block->getTerminator()->eraseFromParent(); // We need values for each phi in the block. 
Since there isn't a good way @@ -193,12 +196,12 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::gepDescriptor(unsigned Weight) { - auto buildGEP = [](ArrayRef Srcs, Instruction *Inst) { + auto buildGEP = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { // TODO: It would be better to generate a random type here, rather than // generating a random value and picking its type. Type *Ty = Srcs[1]->getType(); auto Indices = ArrayRef(Srcs).drop_front(2); - return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", Inst); + return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", InsertPt); }; // TODO: Handle aggregates and vectors // TODO: Support multiple indices. @@ -239,10 +242,11 @@ static SourcePred validExtractValueIndex() { } OpDescriptor llvm::fuzzerop::extractValueDescriptor(unsigned Weight) { - auto buildExtract = [](ArrayRef Srcs, Instruction *Inst) { + auto buildExtract = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { // TODO: It's pretty inefficient to shuffle this all through constants. unsigned Idx = cast(Srcs[1])->getZExtValue(); - return ExtractValueInst::Create(Srcs[0], {Idx}, "E", Inst); + return ExtractValueInst::Create(Srcs[0], {Idx}, "E", InsertPt); }; // TODO: Should we handle multiple indices? return {Weight, {anyAggregateType(), validExtractValueIndex()}, buildExtract}; @@ -298,10 +302,10 @@ static SourcePred validInsertValueIndex() { } OpDescriptor llvm::fuzzerop::insertValueDescriptor(unsigned Weight) { - auto buildInsert = [](ArrayRef Srcs, Instruction *Inst) { + auto buildInsert = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { // TODO: It's pretty inefficient to shuffle this all through constants. 
unsigned Idx = cast(Srcs[2])->getZExtValue(); - return InsertValueInst::Create(Srcs[0], Srcs[1], {Idx}, "I", Inst); + return InsertValueInst::Create(Srcs[0], Srcs[1], {Idx}, "I", InsertPt); }; return { Weight, @@ -310,16 +314,17 @@ OpDescriptor llvm::fuzzerop::insertValueDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::extractElementDescriptor(unsigned Weight) { - auto buildExtract = [](ArrayRef Srcs, Instruction *Inst) { - return ExtractElementInst::Create(Srcs[0], Srcs[1], "E", Inst); + auto buildExtract = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + return ExtractElementInst::Create(Srcs[0], Srcs[1], "E", InsertPt); }; // TODO: Try to avoid undefined accesses. return {Weight, {anyVectorType(), anyIntType()}, buildExtract}; } OpDescriptor llvm::fuzzerop::insertElementDescriptor(unsigned Weight) { - auto buildInsert = [](ArrayRef Srcs, Instruction *Inst) { - return InsertElementInst::Create(Srcs[0], Srcs[1], Srcs[2], "I", Inst); + auto buildInsert = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return InsertElementInst::Create(Srcs[0], Srcs[1], Srcs[2], "I", InsertPt); }; // TODO: Try to avoid undefined accesses. 
return {Weight, @@ -343,8 +348,9 @@ static SourcePred validShuffleVectorIndex() { } OpDescriptor llvm::fuzzerop::shuffleVectorDescriptor(unsigned Weight) { - auto buildShuffle = [](ArrayRef Srcs, Instruction *Inst) { - return new ShuffleVectorInst(Srcs[0], Srcs[1], Srcs[2], "S", Inst); + auto buildShuffle = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + return new ShuffleVectorInst(Srcs[0], Srcs[1], Srcs[2], "S", InsertPt); }; return {Weight, {anyVectorType(), matchFirstType(), validShuffleVectorIndex()}, diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp index 5569888e5b28e8b..fe4ad10a02d57d3 100644 --- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp +++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp @@ -69,9 +69,9 @@ AllocaInst *RandomIRBuilder::createStackMemory(Function *F, Type *Ty, BasicBlock *EntryBB = &F->getEntryBlock(); DataLayout DL(F->getParent()); AllocaInst *Alloca = new AllocaInst(Ty, DL.getAllocaAddrSpace(), "A", - &*EntryBB->getFirstInsertionPt()); + EntryBB->getFirstInsertionPt()); if (Init) - new StoreInst(Init, Alloca, Alloca->getNextNode()); + new StoreInst(Init, Alloca, std::next(Alloca->getIterator())); return Alloca; } @@ -165,7 +165,7 @@ Value *RandomIRBuilder::findOrCreateSource(BasicBlock &BB, Type *Ty = GV->getValueType(); LoadInst *LoadGV = nullptr; if (BB.getTerminator()) { - LoadGV = new LoadInst(Ty, GV, "LGV", &*BB.getFirstInsertionPt()); + LoadGV = new LoadInst(Ty, GV, "LGV", BB.getFirstInsertionPt()); } else { LoadGV = new LoadInst(Ty, GV, "LGV", &BB); } @@ -213,7 +213,7 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef Insts, } // Pick the type independently. 
Type *AccessTy = RS.getSelection()->getType(); - auto *NewLoad = new LoadInst(AccessTy, Ptr, "L", &*IP); + auto *NewLoad = new LoadInst(AccessTy, Ptr, "L", IP); // Only sample this load if it really matches the descriptor if (Pred.matches(Srcs, NewLoad)) @@ -231,7 +231,8 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef Insts, Function *F = BB.getParent(); AllocaInst *Alloca = createStackMemory(F, Ty, newSrc); if (BB.getTerminator()) { - newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", BB.getTerminator()); + newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", + BB.getTerminator()->getIterator()); } else { newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", &BB); } @@ -325,7 +326,7 @@ Instruction *RandomIRBuilder::connectToSink(BasicBlock &BB, for (BasicBlock *Dom : Dominators) { for (Instruction &I : *Dom) { if (isa(I.getType())) - return new StoreInst(V, &I, Insts.back()); + return new StoreInst(V, &I, Insts.back()->getIterator()); } } break; @@ -351,7 +352,7 @@ Instruction *RandomIRBuilder::connectToSink(BasicBlock &BB, Module *M = BB.getParent()->getParent(); auto [GV, DidCreate] = findOrCreateGlobalVariable(M, {}, fuzzerop::onlyType(V->getType())); - return new StoreInst(V, GV, Insts.back()); + return new StoreInst(V, GV, Insts.back()->getIterator()); } case EndOfValueSink: default: @@ -373,7 +374,7 @@ Instruction *RandomIRBuilder::newSink(BasicBlock &BB, } } - return new StoreInst(V, Ptr, Insts.back()); + return new StoreInst(V, Ptr, Insts.back()->getIterator()); } Value *RandomIRBuilder::findPointer(BasicBlock &BB, diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index abd05e316bec148..fa124e46483dcec 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -171,6 +171,7 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, const ConstantRange &CR) { assert(Attribute::isConstantRangeAttrKind(Kind) && "Not a ConstantRange attribute"); + assert(!CR.isFullSet() && "ConstantRange attribute 
must not be full"); LLVMContextImpl *pImpl = Context.pImpl; FoldingSetNodeID ID; ID.AddInteger(Kind); @@ -2020,6 +2021,9 @@ AttrBuilder &AttrBuilder::addInAllocaAttr(Type *Ty) { AttrBuilder &AttrBuilder::addConstantRangeAttr(Attribute::AttrKind Kind, const ConstantRange &CR) { + if (CR.isFullSet()) + return *this; + return addAttribute(Attribute::get(Ctx, Kind, CR)); } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 58ebe7e95cd06cd..93fa635e9b4e171 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2484,8 +2484,9 @@ void ExtractValueInst::init(ArrayRef Idxs, const Twine &Name) { } ExtractValueInst::ExtractValueInst(const ExtractValueInst &EVI) - : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0)), - Indices(EVI.Indices) { + : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0), + (BasicBlock *)nullptr), + Indices(EVI.Indices) { SubclassOptionalData = EVI.SubclassOptionalData; } diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 8669825749d83c7..2599de3437ba293 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -346,6 +346,12 @@ void ReplaceableMetadataImpl::SalvageDebugInfo(const Constant &C) { MetadataTracking::OwnerTy Owner = Pair.second.first; if (!Owner) continue; + // Check for MetadataAsValue. 
+ if (isa(Owner)) { + cast(Owner)->handleChangedMetadata( + ValueAsMetadata::get(UndefValue::get(C.getType()))); + continue; + } if (!isa(Owner)) continue; auto *OwnerMD = dyn_cast_if_present(cast(Owner)); diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index c40a074137ab2da..7a8a6c8daf95938 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -62,10 +63,23 @@ using namespace llvm; -#undef DEBUG_TYPE -#define DEBUG_TYPE "reloc-info" +#define DEBUG_TYPE "elf-object-writer" namespace { +namespace stats { + +STATISTIC(AllocTextBytes, "Total size of SHF_ALLOC text sections"); +STATISTIC(AllocROBytes, "Total size of SHF_ALLOC readonly sections"); +STATISTIC(AllocRWBytes, "Total size of SHF_ALLOC read-write sections"); +STATISTIC(StrtabBytes, "Total size of SHT_STRTAB sections"); +STATISTIC(SymtabBytes, "Total size of SHT_SYMTAB sections"); +STATISTIC(RelocationBytes, "Total size of relocation sections"); +STATISTIC(DynsymBytes, "Total size of SHT_DYNSYM sections"); +STATISTIC(DebugBytes, "Total size of debug info sections"); +STATISTIC(UnwindBytes, "Total size of unwind sections"); +STATISTIC(OtherBytes, "Total size of uncategorized sections"); + +} // namespace stats struct ELFWriter; @@ -951,6 +965,44 @@ void ELFWriter::writeSectionHeader(const MCAssembler &Asm) { else Size = Offsets.second - Offsets.first; + auto SectionHasFlag = [&](uint64_t Flag) -> bool { + return Section->getFlags() & Flag; + }; + + if (Section->getName().starts_with(".debug")) { + stats::DebugBytes += Size; + } else if (Section->getName().starts_with(".eh_frame")) { + stats::UnwindBytes += Size; + } else if (SectionHasFlag(ELF::SHF_ALLOC)) { + if (SectionHasFlag(ELF::SHF_EXECINSTR)) { + 
stats::AllocTextBytes += Size; + } else if (SectionHasFlag(ELF::SHF_WRITE)) { + stats::AllocRWBytes += Size; + } else { + stats::AllocROBytes += Size; + } + } else { + switch (Section->getType()) { + case ELF::SHT_STRTAB: + stats::StrtabBytes += Size; + break; + case ELF::SHT_SYMTAB: + stats::SymtabBytes += Size; + break; + case ELF::SHT_DYNSYM: + stats::DynsymBytes += Size; + break; + case ELF::SHT_REL: + case ELF::SHT_RELA: + case ELF::SHT_CREL: + stats::RelocationBytes += Size; + break; + default: + stats::OtherBytes += Size; + break; + } + } + writeSection(GroupSymbolIndex, Offsets.first, Size, *Section); } } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index bcc69d5ac3db67e..7bc1c870ce5191a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -199,6 +199,7 @@ #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Instrumentation/PoisonChecking.h" +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" @@ -1210,6 +1211,11 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) { return Opts; } +Expected parseRtSanPassOptions(StringRef Params) { + RealtimeSanitizerOptions Result; + return Result; +} + } // namespace /// Tests whether a pass name starts with a valid prefix for a default pipeline diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index c175ee898098495..6927a2886b962b5 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -304,9 +304,7 @@ static cl::opt UseLoopVersioningLICM( "enable-loop-versioning-licm", cl::init(false), cl::Hidden, cl::desc("Enable the experimental Loop 
Versioning LICM pass")); -cl::opt - UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, - cl::desc("Use the specified contextual profile file")); +extern cl::opt UseCtxProfile; namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; @@ -1173,13 +1171,12 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); // We don't want to mix pgo ctx gen and pgo gen; we also don't currently // enable ctx profiling from the frontend. - assert( - !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) && - "Enabling both instrumented FDO and contextual instrumentation is not " - "supported."); + assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && + "Enabling both instrumented PGO and contextual instrumentation is not " + "supported."); // Enable contextual profiling instrumentation. const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; @@ -1670,8 +1667,10 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let // thinlto use the contextual info to perform imports; then use the contextual // profile in the post-thinlink phase. - if (!UseCtxProfile.empty() && !PGOOpt) + if (!UseCtxProfile.empty() && !PGOOpt) { + addRequiredLTOPreLinkPasses(MPM); return MPM; + } // Run partial inlining pass to partially inline functions that have // large bodies. 
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 61a5bab92927fe6..95842d15a35bf60 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -592,6 +592,10 @@ FUNCTION_PASS_WITH_PARAMS( return WinEHPreparePass(DemoteCatchSwitchPHIOnly); }, parseWinEHPrepareOptions, "demote-catchswitch-only") +FUNCTION_PASS_WITH_PARAMS( + "rtsan", "RealtimeSanitizerPass", + [](RealtimeSanitizerOptions Opts) { return RealtimeSanitizerPass(Opts); }, + parseRtSanPassOptions, "") #undef FUNCTION_PASS_WITH_PARAMS #ifndef LOOPNEST_PASS diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index 990aef32a396faf..daa41eb4912ef47 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -278,7 +278,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #else #define DUPMAX 255 #endif -#define INFINITY (DUPMAX + 1) +#define REGINFINITY (DUPMAX + 1) #ifndef NDEBUG static int never = 0; /* for use in asserts; shuts lint up */ @@ -582,7 +582,7 @@ p_ere_exp(struct parse *p) count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ - count2 = INFINITY; + count2 = REGINFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); @@ -753,7 +753,7 @@ p_simp_re(struct parse *p, count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ - count2 = INFINITY; + count2 = REGINFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); @@ -1115,7 +1115,7 @@ repeat(struct parse *p, # define N 2 # define INF 3 # define REP(f, t) ((f)*8 + (t)) -# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) +# define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? 
INF : N) sopno copy; if (p->error != 0) /* head off possible runaway recursion */ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 862b5c7e3e3d753..3fc02c6da37cb8a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5999,34 +5999,6 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } -TargetLowering::AtomicExpansionKind -AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - switch (RMW->getOperation()) { - case AtomicRMWInst::Nand: - case AtomicRMWInst::FAdd: - case AtomicRMWInst::FSub: - case AtomicRMWInst::FMax: - case AtomicRMWInst::FMin: - return AtomicExpansionKind::CmpXChg; - case AtomicRMWInst::Xchg: { - const DataLayout &DL = RMW->getFunction()->getDataLayout(); - unsigned ValSize = DL.getTypeSizeInBits(RMW->getType()); - if (ValSize == 32 || ValSize == 64) - return AtomicExpansionKind::None; - return AtomicExpansionKind::CmpXChg; - } - default: { - if (auto *IntTy = dyn_cast(RMW->getType())) { - unsigned Size = IntTy->getBitWidth(); - if (Size == 32 || Size == 64) - return AtomicExpansionKind::None; - } - - return AtomicExpansionKind::CmpXChg; - } - } -} - /// Whether it is profitable to sink the operands of an /// Instruction I to the basic block of I. /// This helps using several modifiers (like abs and neg) more often. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 37572af3897f2ec..1a5244f7ec809b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -388,8 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering { return MVT::i32; } - AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - bool shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const override; }; diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index c8b594ffbc64522..bbd9d75aac0e916 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDKernelCodeT.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -1133,7 +1134,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { void print(raw_ostream &OS) const override { switch (Kind) { case Register: - OS << "'; + OS << "'; break; case Immediate: OS << '<' << getImm(); diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 7e0d96622f3c5da..c79688bf038be30 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -2175,14 +2175,33 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, TargetLowering::AtomicExpansionKind R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { + case AtomicRMWInst::Nand: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: + return AtomicExpansionKind::CmpXChg; case 
AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: // FIXME: Cayman at least appears to have instructions for this, but the // instruction defintions appear to be missing. return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::Xchg: { + const DataLayout &DL = RMW->getFunction()->getDataLayout(); + unsigned ValSize = DL.getTypeSizeInBits(RMW->getType()); + if (ValSize == 32 || ValSize == 64) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + } default: - break; + if (auto *IntTy = dyn_cast(RMW->getType())) { + unsigned Size = IntTy->getBitWidth(); + if (Size == 32 || Size == 64) + return AtomicExpansionKind::None; + } + + return AtomicExpansionKind::CmpXChg; } - return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); + llvm_unreachable("covered atomicrmw op switch"); } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 32ecf350db59cf9..875738dad74cedf 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1460,7 +1460,15 @@ bool SIFoldOperands::tryFoldFoldableCopy( return false; } - MachineOperand &OpToFold = MI.getOperand(1); + MachineOperand *OpToFoldPtr; + if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) { + // Folding when any src_modifiers are non-zero is unsupported + if (TII->hasAnyModifiersSet(MI)) + return false; + OpToFoldPtr = &MI.getOperand(2); + } else + OpToFoldPtr = &MI.getOperand(1); + MachineOperand &OpToFold = *OpToFoldPtr; bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4e9c271197613b1..f8767e00949bf0e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16114,6 +16114,39 @@ static bool isBFloat2(Type *Ty) { return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); } +/// \return true if atomicrmw integer ops work for the type. +static bool isAtomicRMWLegalIntTy(Type *Ty) { + if (auto *IT = dyn_cast(Ty)) { + unsigned BW = IT->getBitWidth(); + return BW == 32 || BW == 64; + } + + return false; +} + +/// \return true if this atomicrmw xchg type can be selected. +static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) { + Type *Ty = RMW->getType(); + if (isAtomicRMWLegalIntTy(Ty)) + return true; + + if (PointerType *PT = dyn_cast(Ty)) { + const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout(); + unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace()); + return BW == 32 || BW == 64; + } + + if (Ty->isFloatTy() || Ty->isDoubleTy()) + return true; + + if (FixedVectorType *VT = dyn_cast(Ty)) { + return VT->getNumElements() == 2 && + VT->getElementType()->getPrimitiveSizeInBits() == 16; + } + + return false; +} + /// \returns true if it's valid to emit a native instruction for \p RMW, based /// on the properties of the target memory. static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, @@ -16142,6 +16175,14 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, .getValueAsBool(); } +/// \return Action to perform on AtomicRMWInsts for integer operations. +static TargetLowering::AtomicExpansionKind +atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { + return isAtomicRMWLegalIntTy(RMW->getType()) + ? 
TargetLowering::AtomicExpansionKind::None + : TargetLowering::AtomicExpansionKind::CmpXChg; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16161,7 +16202,22 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); - switch (RMW->getOperation()) { + auto Op = RMW->getOperation(); + switch (Op) { + case AtomicRMWInst::Xchg: { + // PCIe supports add and xchg for system atomics. + return isAtomicRMWLegalXChgTy(RMW) + ? TargetLowering::AtomicExpansionKind::None + : TargetLowering::AtomicExpansionKind::CmpXChg; + + // PCIe supports add and xchg for system atomics. + return atomicSupportedIfLegalIntType(RMW); + } + case AtomicRMWInst::Add: + case AtomicRMWInst::And: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: + return atomicSupportedIfLegalIntType(RMW); case AtomicRMWInst::Sub: case AtomicRMWInst::Or: case AtomicRMWInst::Xor: { @@ -16173,7 +16229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::Expand; } - break; + return atomicSupportedIfLegalIntType(RMW); } case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); @@ -16335,13 +16391,16 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (HasSystemScope) return AtomicExpansionKind::CmpXChg; } - break; + + return atomicSupportedIfLegalIntType(RMW); } + case AtomicRMWInst::Nand: + case AtomicRMWInst::FSub: default: - break; + return AtomicExpansionKind::CmpXChg; } - return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); + llvm_unreachable("covered atomicrmw op switch"); } TargetLowering::AtomicExpansionKind @@ -16550,9 +16609,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { // // With this expansion we produce the following code: // [...] 
- // br label %atomicrmw.check.shared - // - // atomicrmw.check.shared: // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private // @@ -16595,8 +16651,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Function *F = BB->getParent(); BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); - BasicBlock *CheckSharedBB = - BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB); BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); BasicBlock *CheckPrivateBB = BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); @@ -16623,9 +16677,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - Builder.CreateBr(CheckSharedBB); - - Builder.SetInsertPoint(CheckSharedBB); CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared"); Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); @@ -16659,12 +16710,13 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.CreateBr(PhiBB); Builder.SetInsertPoint(PhiBB); - PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi"); + PHINode *Loaded = Builder.CreatePHI(ValTy, 3); Loaded->addIncoming(LoadedShared, SharedBB); Loaded->addIncoming(LoadedPrivate, PrivateBB); Loaded->addIncoming(LoadedGlobal, GlobalBB); Builder.CreateBr(ExitBB); + Loaded->takeName(AI); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); } diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 1315aa08557888f..59a1eee8d4f91d1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2611,6 +2611,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { Modified = true; } ReleaseVGPRInsts.clear(); + 
SLoadAddresses.clear(); return Modified; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b6dd4905fb61bb6..8af5c364509f0e6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3369,6 +3369,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B16_t16_e32: + case AMDGPU::V_MOV_B16_t16_e64: case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: @@ -5639,7 +5641,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); - unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO + : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64 + : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index dd4e0d53202d4ed..ee72837a50fc436 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -797,6 +797,23 @@ int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e32: { + int OtherIdx = Idx == 1 ? 2 : 1; + const MachineOperand &OtherOp = MI->getOperand(OtherIdx); + return OtherOp.isImm() ? OtherOp.getImm() : 0; + } + case AMDGPU::V_ADD_CO_U32_e64: { + int OtherIdx = Idx == 2 ? 3 : 2; + const MachineOperand &OtherOp = MI->getOperand(OtherIdx); + return OtherOp.isImm() ? 
OtherOp.getImm() : 0; + } + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return 0; @@ -809,7 +826,60 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, return getScratchInstrOffset(MI); } +static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, + const MachineInstr &MI) { + assert(MI.getDesc().isAdd()); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); + + if (Src0.isFI()) { + return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), + Src1.getReg())); + } + + if (Src1.isFI()) { + return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), + Src0.getReg())); + } + + return false; +} + bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes. + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: { + // TODO: We could handle this but it requires work to avoid violating + // operand restrictions. + if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 && + !isFIPlusImmOrVGPR(*this, *MI)) + return false; + [[fallthrough]]; + } + case AMDGPU::V_ADD_U32_e64: + // FIXME: This optimization is barely profitable with enableFlatScratch as-is. + // + // Much of the benefit with the MUBUF handling is we avoid duplicating the + // shift of the frame register, which isn't needed with scratch. + // + // materializeFrameBaseRegister doesn't know the register classes of the + // uses, and unconditionally uses an s_add_i32, which will end up using a + // copy for the vector uses. 
+ return !ST.enableFlatScratch(); + case AMDGPU::V_ADD_CO_U32_e32: + if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 && + !isFIPlusImmOrVGPR(*this, *MI)) + return false; + // We can't deal with the case where the carry out has a use (though this + // should never happen) + return MI->getOperand(3).isDead(); + case AMDGPU::V_ADD_CO_U32_e64: + // TODO: Should we check use_empty instead? + return MI->getOperand(1).isDead(); + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return false; @@ -860,6 +930,8 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addFrameIndex(FrameIdx); if (ST.enableFlatScratch() ) { + // FIXME: Mark scc as dead + // FIXME: Make sure scc isn't live in. BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); @@ -877,6 +949,86 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { const SIInstrInfo *TII = ST.getInstrInfo(); + + switch (MI.getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: { + MachineOperand *FIOp = &MI.getOperand(2); + MachineOperand *ImmOp = &MI.getOperand(1); + if (!FIOp->isFI()) + std::swap(FIOp, ImmOp); + + if (!ImmOp->isImm()) { + assert(Offset == 0); + FIOp->ChangeToRegister(BaseReg, false); + TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI); + return; + } + + int64_t TotalOffset = ImmOp->getImm() + Offset; + if (TotalOffset == 0) { + MI.setDesc(TII->get(AMDGPU::COPY)); + for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) + MI.removeOperand(I); + + MI.getOperand(1).ChangeToRegister(BaseReg, false); + return; + } + + ImmOp->setImm(TotalOffset); + + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // FIXME: materializeFrameBaseRegister does 
not know the register class of + // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit + // a copy so we have a legal operand and hope the register coalescer can + // clean it up. + if (isSGPRReg(MRI, BaseReg)) { + Register BaseRegVGPR = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR) + .addReg(BaseReg); + MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false); + } else { + MI.getOperand(2).ChangeToRegister(BaseReg, false); + } + return; + } + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e64: { + int Src0Idx = MI.getNumExplicitDefs(); + MachineOperand *FIOp = &MI.getOperand(Src0Idx); + MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1); + if (!FIOp->isFI()) + std::swap(FIOp, ImmOp); + + if (!ImmOp->isImm()) { + FIOp->ChangeToRegister(BaseReg, false); + TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI); + return; + } + + int64_t TotalOffset = ImmOp->getImm() + Offset; + if (TotalOffset == 0) { + MI.setDesc(TII->get(AMDGPU::COPY)); + + for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) + MI.removeOperand(I); + + MI.getOperand(1).ChangeToRegister(BaseReg, false); + } else { + FIOp->ChangeToRegister(BaseReg, false); + ImmOp->setImm(TotalOffset); + } + + return; + } + default: + break; + } + bool IsFlat = TII->isFLATScratch(MI); #ifndef NDEBUG @@ -925,6 +1077,18 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const { + + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: + return true; + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e64: + return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset); + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return false; diff --git 
a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 79bcf5e8cd30d4f..155747551471e38 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -657,6 +657,7 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs( // although requirements match the pass placement and it reduces code size too. MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MovT.getOpcode() == AMDGPU::COPY); Register T = MovT.getOperand(0).getReg(); @@ -668,7 +669,12 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { Register X = Xop.getReg(); unsigned Xsub = Xop.getSubReg(); - unsigned Size = TII->getOpSize(MovT, 0) / 4; + unsigned Size = TII->getOpSize(MovT, 0); + + // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers + // are not allocatable. 
+ if (Size == 2 && X.isVirtual()) + return nullptr; if (!TRI->isVGPR(*MRI, X)) return nullptr; @@ -684,9 +690,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && MovY->getOpcode() != AMDGPU::COPY) || - !MovY->getOperand(1).isReg() || - MovY->getOperand(1).getReg() != T || + !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T || MovY->getOperand(1).getSubReg() != Tsub) continue; @@ -714,6 +720,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { } if (MovX || (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && + I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && I->getOpcode() != AMDGPU::COPY) || I->getOperand(0).getReg() != X || I->getOperand(0).getSubReg() != Xsub) { @@ -721,7 +728,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { break; } - if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) + if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 
0U : 1U))) continue; MovX = &*I; @@ -730,23 +737,40 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { if (!MovX) continue; - LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); + LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY); - for (unsigned I = 0; I < Size; ++I) { - TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I); - Y1 = getSubRegForIndex(Y, Ysub, I); - MachineBasicBlock &MBB = *MovT.getParent(); + MachineBasicBlock &MBB = *MovT.getParent(); + SmallVector Swaps; + if (Size == 2) { auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), - TII->get(AMDGPU::V_SWAP_B32)) - .addDef(X1.Reg, 0, X1.SubReg) - .addDef(Y1.Reg, 0, Y1.SubReg) - .addReg(Y1.Reg, 0, Y1.SubReg) - .addReg(X1.Reg, 0, X1.SubReg).getInstr(); - if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { - // Drop implicit EXEC. - MIB->removeOperand(MIB->getNumExplicitOperands()); - MIB->copyImplicitOps(*MBB.getParent(), *MovX); + TII->get(AMDGPU::V_SWAP_B16)) + .addDef(X) + .addDef(Y) + .addReg(Y) + .addReg(X) + .getInstr(); + Swaps.push_back(MIB); + } else { + assert(Size > 0 && Size % 4 == 0); + for (unsigned I = 0; I < Size / 4; ++I) { + TargetInstrInfo::RegSubRegPair X1, Y1; + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); + auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), + TII->get(AMDGPU::V_SWAP_B32)) + .addDef(X1.Reg, 0, X1.SubReg) + .addDef(Y1.Reg, 0, Y1.SubReg) + .addReg(Y1.Reg, 0, Y1.SubReg) + .addReg(X1.Reg, 0, X1.SubReg) + .getInstr(); + Swaps.push_back(MIB); + } + } + // Drop implicit EXEC. 
+ if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { + for (MachineInstr *Swap : Swaps) { + Swap->removeOperand(Swap->getNumExplicitOperands()); + Swap->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); @@ -833,6 +857,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MI.getOpcode() == AMDGPU::COPY)) { if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); @@ -1023,7 +1048,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::NoVRegs)) continue; - if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && + if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && !shouldShrinkTrue16(MI)) continue; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index de3191bd91df608..2e73a1a15f6b323 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -640,12 +640,12 @@ class SOP2_64_32_32 pattern=[]> : SOP2_Pseudo < let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { +let isCommutable = 1, isAdd = 1 in { def S_ADD_U32 : SOP2_32 <"s_add_u32">; def S_ADD_I32 : SOP2_32 <"s_add_i32", [(set i32:$sdst, (UniformBinFrag SSrc_b32:$src0, SSrc_b32:$src1))] >; -} // End isCommutable = 1 +} // End isCommutable = 1, isAdd = 1 def S_SUB_U32 : SOP2_32 <"s_sub_u32">; def S_SUB_I32 : SOP2_32 <"s_sub_i32", diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 0a2e338b347871c..34d12aa5e078354 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -751,7 +751,7 @@ let SubtargetPredicate = isGFX11Plus in { let IsInvalidSingleUseConsumer = 1; let IsInvalidSingleUseProducer = 1; } - defm V_MOV_B16_t16 : 
VOP1Inst<"v_mov_b16_t16", VOPProfile_True16>; + defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 44eb5f5abafe000..d17b4f24081312c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -763,7 +763,11 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. -defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>; + +let isAdd = 1 in { + defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>; +} + defm V_SUB_CO_U32 : VOP2bInst <"v_sub_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>; defm V_SUBREV_CO_U32 : VOP2bInst <"v_subrev_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>; defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>; @@ -772,7 +776,11 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { -defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; + +let isAdd = 1 in { + defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; +} + defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp 
b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 75d16a42d0205a8..476b7b349294ab3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3024,18 +3024,27 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( assert(Subtarget->supportsTailCall()); - // Indirect tail calls cannot be optimized for Thumb1 if the args - // to the call take up r0-r3. The reason is that there are no legal registers - // left to hold the pointer to the function to be called. - // Similarly, if the function uses return address sign and authentication, - // r12 is needed to hold the PAC and is not available to hold the callee - // address. - if (Outs.size() >= 4 && - (!isa(Callee.getNode()) || isIndirect)) { - if (Subtarget->isThumb1Only()) - return false; - // Conservatively assume the function spills LR. - if (MF.getInfo()->shouldSignReturnAddress(true)) + // Indirect tail-calls require a register to hold the target address. That + // register must be: + // * Allocatable (i.e. r0-r7 if the target is Thumb1). + // * Not callee-saved, so must be one of r0-r3 or r12. + // * Not used to hold an argument to the tail-called function, which might be + // in r0-r3. + // * Not used to hold the return address authentication code, which is in r12 + // if enabled. + // Sometimes, no register matches all of these conditions, so we can't do a + // tail-call. 
+ if (!isa(Callee.getNode()) || isIndirect) { + SmallSet AddressRegisters; + for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) + AddressRegisters.insert(R); + if (!(Subtarget->isThumb1Only() || + MF.getInfo()->shouldSignReturnAddress(true))) + AddressRegisters.insert(ARM::R12); + for (const CCValAssign &AL : ArgLocs) + if (AL.isRegLoc()) + AddressRegisters.erase(AL.getLocReg()); + if (AddressRegisters.empty()) return false; } diff --git a/llvm/lib/Target/LoongArch/CMakeLists.txt b/llvm/lib/Target/LoongArch/CMakeLists.txt index cadc335a621f2e5..0f674b1b0fa9e22 100644 --- a/llvm/lib/Target/LoongArch/CMakeLists.txt +++ b/llvm/lib/Target/LoongArch/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_target(LoongArchCodeGen LoongArchISelDAGToDAG.cpp LoongArchISelLowering.cpp LoongArchMCInstLower.cpp + LoongArchMergeBaseOffset.cpp LoongArchOptWInstrs.cpp LoongArchRegisterInfo.cpp LoongArchSubtarget.cpp diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h index adfb844ee31b649..db6052373888093 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.h +++ b/llvm/lib/Target/LoongArch/LoongArch.h @@ -36,12 +36,14 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, FunctionPass *createLoongArchDeadRegisterDefinitionsPass(); FunctionPass *createLoongArchExpandAtomicPseudoPass(); FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM); +FunctionPass *createLoongArchMergeBaseOffsetOptPass(); FunctionPass *createLoongArchOptWInstrsPass(); FunctionPass *createLoongArchPreRAExpandPseudoPass(); FunctionPass *createLoongArchExpandPseudoPass(); void initializeLoongArchDAGToDAGISelLegacyPass(PassRegistry &); void initializeLoongArchDeadRegisterDefinitionsPass(PassRegistry &); void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &); +void initializeLoongArchMergeBaseOffsetOptPass(PassRegistry &); void initializeLoongArchOptWInstrsPass(PassRegistry &); void initializeLoongArchPreRAExpandPseudoPass(PassRegistry &); void 
initializeLoongArchExpandPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index f478870217ec606..8bb9497a847fa79 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -130,10 +130,16 @@ bool LoongArchAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, OS << "$" << LoongArchInstPrinter::getRegisterName(BaseMO.getReg()); // Print the offset operand. const MachineOperand &OffsetMO = MI->getOperand(OpNo + 1); + MCOperand MCO; + if (!lowerOperand(OffsetMO, MCO)) + return true; if (OffsetMO.isReg()) OS << ", $" << LoongArchInstPrinter::getRegisterName(OffsetMO.getReg()); else if (OffsetMO.isImm()) OS << ", " << OffsetMO.getImm(); + else if (OffsetMO.isGlobal() || OffsetMO.isBlockAddress() || + OffsetMO.isMCSymbol()) + OS << ", " << *MCO.getExpr(); else return true; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bdbb7ab0f513957..44d684331b3eb46 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5573,6 +5573,24 @@ SDValue LoongArchTargetLowering::LowerReturn( return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); } +bool LoongArchTargetLowering::isFPImmVLDILegal(const APFloat &Imm, + EVT VT) const { + if (!Subtarget.hasExtLSX()) + return false; + + if (VT == MVT::f32) { + uint64_t masked = Imm.bitcastToAPInt().getZExtValue() & 0x7e07ffff; + return (masked == 0x3e000000 || masked == 0x40000000); + } + + if (VT == MVT::f64) { + uint64_t masked = Imm.bitcastToAPInt().getZExtValue() & 0x7fc0ffffffffffff; + return (masked == 0x3fc0000000000000 || masked == 0x4000000000000000); + } + + return false; +} + bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { // TODO: Maybe need more checks here after vector 
extension is supported. @@ -5580,7 +5598,7 @@ bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; if (VT == MVT::f64 && !Subtarget.hasBasicD()) return false; - return (Imm.isZero() || Imm.isExactlyValue(+1.0)); + return (Imm.isZero() || Imm.isExactlyValue(1.0) || isFPImmVLDILegal(Imm, VT)); } bool LoongArchTargetLowering::isCheapToSpeculateCttz(Type *) const { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index d834a5d8587fd95..9723789e919b153 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -260,6 +260,8 @@ class LoongArchTargetLowering : public TargetLowering { bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override; + bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const; + private: /// Target-specific function used to lower LoongArch calling conventions. typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 0580683c3ce3033..cac47e35afe258c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -203,6 +203,29 @@ def to_valid_timm : SDNodeXFormgetTargetConstant(CN->getSExtValue(), SDLoc(N), Subtarget->getGRLenVT()); }]>; +// FP immediate of VLDI patterns. 
+def f32imm_vldi : PatLeaf<(fpimm), [{ + const auto &TLI = + *static_cast(getTargetLowering()); + return TLI.isFPImmVLDILegal(N->getValueAPF(), MVT::f32); +}]>; +def f64imm_vldi : PatLeaf<(fpimm), [{ + const auto &TLI = + *static_cast(getTargetLowering()); + return TLI.isFPImmVLDILegal(N->getValueAPF(), MVT::f64); +}]>; + +def to_f32imm_vldi : SDNodeXFormgetValueAPF().bitcastToAPInt().getZExtValue(); + x = (0b11011 << 8) | (((x >> 24) & 0xc0) ^ 0x40) | ((x >> 19) & 0x3f); + return CurDAG->getTargetConstant(SignExtend32<13>(x), SDLoc(N), MVT::i32); +}]>; +def to_f64imm_vldi : SDNodeXFormgetValueAPF().bitcastToAPInt().getZExtValue(); + x = (0b11100 << 8) | (((x >> 56) & 0xc0) ^ 0x40) | ((x >> 48) & 0x3f); + return CurDAG->getTargetConstant(SignExtend32<13>(x), SDLoc(N), MVT::i32); +}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -663,7 +686,9 @@ def VMSKGEZ_B : LSX2R_VV<0x729c5000>; def VMSKNZ_B : LSX2R_VV<0x729c6000>; +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def VLDI : LSX1RI13_VI<0x73e00000>; +} def VAND_V : LSX3R_VVV<0x71260000>; def VOR_V : LSX3R_VVV<0x71268000>; @@ -1910,6 +1935,12 @@ def : Pat<(v2i64 (fp_to_sint v2f64:$vj)), (VFTINTRZ_L_D v2f64:$vj)>; def : Pat<(v4i32 (fp_to_uint v4f32:$vj)), (VFTINTRZ_WU_S v4f32:$vj)>; def : Pat<(v2i64 (fp_to_uint v2f64:$vj)), (VFTINTRZ_LU_D v2f64:$vj)>; +// Vector loads floating-point constants +def : Pat<(f32 f32imm_vldi:$in), + (f32 (EXTRACT_SUBREG (VLDI (to_f32imm_vldi f32imm_vldi:$in)), sub_32))>; +def : Pat<(f64 f64imm_vldi:$in), + (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; + } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp new file mode 100644 index 000000000000000..ae50b7a6f923e3a --- /dev/null +++ 
b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp @@ -0,0 +1,636 @@ +//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Merge the offset of address calculation into the offset field +// of instructions in a global address lowering sequence. +// +//===----------------------------------------------------------------------===// + +#include "LoongArch.h" +#include "LoongArchTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetOptions.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-merge-base-offset" +#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset" + +namespace { + +class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass { + const LoongArchSubtarget *ST = nullptr; + MachineRegisterInfo *MRI; + +public: + static char ID; + bool runOnMachineFunction(MachineFunction &Fn) override; + bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + + bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, + int64_t Offset); + bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last, MachineInstr &TailAdd, + Register GAReg); + + bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12, + 
MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + + LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return LoongArch_MERGE_BASE_OFFSET_NAME; + } +}; +} // end anonymous namespace + +char LoongArchMergeBaseOffsetOpt::ID = 0; +INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE, + LoongArch_MERGE_BASE_OFFSET_NAME, false, false) + +// Detect either of the patterns: +// +// 1. (small/medium): +// pcalau12i vreg1, %pc_hi20(s) +// addi.d vreg2, vreg1, %pc_lo12(s) +// +// 2. (large): +// pcalau12i vreg1, %pc_hi20(s) +// addi.d vreg2, $zero, %pc_lo12(s) +// lu32i.d vreg3, vreg2, %pc64_lo20(s) +// lu52i.d vreg4, vreg3, %pc64_hi12(s) +// add.d vreg5, vreg4, vreg1 + +// The pattern is only accepted if: +// 1) For small and medium pattern, the first instruction has only one use, +// which is the ADDI. +// 2) For large pattern, the first four instructions each have only one use, +// and the user of the fourth instruction is ADD. +// 3) The address operands have the appropriate type, reflecting the +// lowering of a global address or constant pool using the pattern. +// 4) The offset value in the Global Address or Constant Pool is 0. 
+bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, + MachineInstr *&Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + if (Hi20.getOpcode() != LoongArch::PCALAU12I) + return false; + + const MachineOperand &Hi20Op1 = Hi20.getOperand(1); + if (Hi20Op1.getTargetFlags() != LoongArchII::MO_PCREL_HI) + return false; + + auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) { + return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress(); + }; + + if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0) + return false; + + Register HiDestReg = Hi20.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; + + MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg); + if (UseInst->getOpcode() != LoongArch::ADD_D) { + Lo12 = UseInst; + if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || + (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) + return false; + } else { + assert(ST->is64Bit()); + Last = UseInst; + + Register LastOp1Reg = Last->getOperand(1).getReg(); + if (!LastOp1Reg.isVirtual()) + return false; + Hi12 = MRI->getVRegDef(LastOp1Reg); + const MachineOperand &Hi12Op2 = Hi12->getOperand(2); + if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI) + return false; + if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0) + return false; + if (!MRI->hasOneUse(Hi12->getOperand(0).getReg())) + return false; + + Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg()); + const MachineOperand &Lo20Op2 = Lo20->getOperand(2); + if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO) + return false; + if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0) + return false; + if (!MRI->hasOneUse(Lo20->getOperand(0).getReg())) + return false; + + Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg()); + if (!MRI->hasOneUse(Lo12->getOperand(0).getReg())) + return false; + } + + const MachineOperand &Lo12Op2 = Lo12->getOperand(2); + 
assert(Hi20.getOpcode() == LoongArch::PCALAU12I); + if (Lo12Op2.getTargetFlags() != LoongArchII::MO_PCREL_LO || + !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) || + Lo12Op2.getOffset() != 0) + return false; + + if (Hi20Op1.isGlobal()) { + LLVM_DEBUG(dbgs() << " Found lowered global address: " + << *Hi20Op1.getGlobal() << "\n"); + } else if (Hi20Op1.isBlockAddress()) { + LLVM_DEBUG(dbgs() << " Found lowered basic address: " + << *Hi20Op1.getBlockAddress() << "\n"); + } else if (Hi20Op1.isCPI()) { + LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() + << "\n"); + } + + return true; +} + +// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions. +// Delete the tail instruction and update all the uses to use the +// output from Last. +void LoongArchMergeBaseOffsetOpt::foldOffset( + MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, + int64_t Offset) { + assert(isInt<32>(Offset) && "Unexpected offset"); + // Put the offset back in Hi and the Lo + Hi20.getOperand(1).setOffset(Offset); + Lo12.getOperand(2).setOffset(Offset); + if (Lo20 && Hi12) { + Lo20->getOperand(2).setOffset(Offset); + Hi12->getOperand(2).setOffset(Offset); + } + // Delete the tail instruction. + MachineInstr *Def = Last ? Last : &Lo12; + MRI->constrainRegClass(Def->getOperand(0).getReg(), + MRI->getRegClass(Tail.getOperand(0).getReg())); + MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg()); + Tail.eraseFromParent(); + LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" + << " " << Hi20 << " " << Lo12;); + if (Lo20 && Hi12) { + LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;); + } +} + +// Detect patterns for large offsets that are passed into an ADD instruction. +// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12 +// instructions and deletes TailAdd and the instructions that produced the +// offset. 
+// +// Base address lowering is of the form: +// Hi20: pcalau12i vreg1, %pc_hi20(s) +// Lo12: addi.d vreg2, vreg1, %pc_lo12(s) +// / \ +// / \ +// / \ +// / The large offset can be of two forms: \ +// 1) Offset that has non zero bits in lower 2) Offset that has non zero +// 12 bits and upper 20 bits bits in upper 20 bits only +// OffsetHi: lu12i.w vreg3, 4 +// OffsetLo: ori voff, vreg3, 188 OffsetHi: lu12i.w voff, 128 +// \ / +// \ / +// \ / +// \ / +// TailAdd: add.d vreg4, vreg2, voff +bool LoongArchMergeBaseOffsetOpt::foldLargeOffset( + MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd, + Register GAReg) { + assert((TailAdd.getOpcode() == LoongArch::ADD_W || + TailAdd.getOpcode() == LoongArch::ADD_D) && + "Expected ADD instruction!"); + Register Rs = TailAdd.getOperand(1).getReg(); + Register Rt = TailAdd.getOperand(2).getReg(); + Register Reg = Rs == GAReg ? Rt : Rs; + + // Can't fold if the register has more than one use. + if (!Reg.isVirtual() || !MRI->hasOneUse(Reg)) + return false; + // This can point to an ORI or a LU12I.W: + MachineInstr &OffsetTail = *MRI->getVRegDef(Reg); + if (OffsetTail.getOpcode() == LoongArch::ORI) { + // The offset value has non zero bits in both %hi and %lo parts. + // Detect an ORI that feeds from a LU12I.W instruction. + MachineOperand &OriImmOp = OffsetTail.getOperand(2); + if (OriImmOp.getTargetFlags() != LoongArchII::MO_None) + return false; + Register OriReg = OffsetTail.getOperand(1).getReg(); + int64_t OffLo = OriImmOp.getImm(); + + // Handle rs1 of ORI is R0. 
+ if (OriReg == LoongArch::R0) { + LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, OffLo); + OffsetTail.eraseFromParent(); + return true; + } + + MachineInstr &OffsetLu12i = *MRI->getVRegDef(OriReg); + MachineOperand &Lu12iImmOp = OffsetLu12i.getOperand(1); + if (OffsetLu12i.getOpcode() != LoongArch::LU12I_W || + Lu12iImmOp.getTargetFlags() != LoongArchII::MO_None || + !MRI->hasOneUse(OffsetLu12i.getOperand(0).getReg())) + return false; + int64_t Offset = SignExtend64<32>(Lu12iImmOp.getImm() << 12); + Offset += OffLo; + // LU12I.W+ORI sign extends the result. + Offset = SignExtend64<32>(Offset); + LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail + << " " << OffsetLu12i); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); + OffsetTail.eraseFromParent(); + OffsetLu12i.eraseFromParent(); + return true; + } else if (OffsetTail.getOpcode() == LoongArch::LU12I_W) { + // The offset value has all zero bits in the lower 12 bits. Only LU12I.W + // exists. + LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail); + int64_t Offset = SignExtend64<32>(OffsetTail.getOperand(1).getImm() << 12); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); + OffsetTail.eraseFromParent(); + return true; + } + return false; +} + +bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20, + MachineInstr &Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + Register DestReg = + Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); + + // Look for arithmetic instructions we can get an offset from. + // We might be able to remove the arithmetic instructions by folding the + // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I). + if (!MRI->hasOneUse(DestReg)) + return false; + + // DestReg has only one use. 
+ MachineInstr &Tail = *MRI->use_instr_begin(DestReg); + switch (Tail.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" + << Tail); + break; + case LoongArch::ADDI_W: + if (ST->is64Bit()) + return false; + [[fallthrough]]; + case LoongArch::ADDI_D: + case LoongArch::ADDU16I_D: { + // Offset is simply an immediate operand. + int64_t Offset = Tail.getOperand(2).getImm(); + if (Tail.getOpcode() == LoongArch::ADDU16I_D) + Offset = SignExtend64<32>(Offset << 16); + + // We might have two ADDIs in a row. + Register TailDestReg = Tail.getOperand(0).getReg(); + if (MRI->hasOneUse(TailDestReg)) { + MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg); + if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W) + return false; + if (TailTail.getOpcode() == LoongArch::ADDI_W || + TailTail.getOpcode() == LoongArch::ADDI_D) { + Offset += TailTail.getOperand(2).getImm(); + LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset); + Tail.eraseFromParent(); + return true; + } + } + + LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset); + return true; + } + case LoongArch::ADD_W: + if (ST->is64Bit()) + return false; + [[fallthrough]]; + case LoongArch::ADD_D: + // The offset is too large to fit in the immediate field of ADDI. + // This can be in two forms: + // 1) LU12I.W hi_offset followed by: + // ORI lo_offset + // This happens in case the offset has non zero bits in + // both hi 20 and lo 12 bits. + // 2) LU12I.W (offset20) + // This happens in case the lower 12 bits of the offset are zeros. + return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg); + break; + } + + return false; +} + +// Memory access opcode mapping for transforms. +static unsigned getNewOpc(unsigned Op, bool isLarge) { + switch (Op) { + case LoongArch::LD_B: + return isLarge ? 
LoongArch::LDX_B : LoongArch::LD_B; + case LoongArch::LD_H: + return isLarge ? LoongArch::LDX_H : LoongArch::LD_H; + case LoongArch::LD_W: + case LoongArch::LDPTR_W: + return isLarge ? LoongArch::LDX_W : LoongArch::LD_W; + case LoongArch::LD_D: + case LoongArch::LDPTR_D: + return isLarge ? LoongArch::LDX_D : LoongArch::LD_D; + case LoongArch::LD_BU: + return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU; + case LoongArch::LD_HU: + return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU; + case LoongArch::LD_WU: + return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU; + case LoongArch::FLD_S: + return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S; + case LoongArch::FLD_D: + return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D; + case LoongArch::ST_B: + return isLarge ? LoongArch::STX_B : LoongArch::ST_B; + case LoongArch::ST_H: + return isLarge ? LoongArch::STX_H : LoongArch::ST_H; + case LoongArch::ST_W: + case LoongArch::STPTR_W: + return isLarge ? LoongArch::STX_W : LoongArch::ST_W; + case LoongArch::ST_D: + case LoongArch::STPTR_D: + return isLarge ? LoongArch::STX_D : LoongArch::ST_D; + case LoongArch::FST_S: + return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S; + case LoongArch::FST_D: + return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D; + default: + llvm_unreachable("Unexpected opcode for replacement"); + } +} + +bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, + MachineInstr &Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + Register DestReg = + Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); + + // If all the uses are memory ops with the same offset, we can transform: + // + // 1. (small/medium): + // pcalau12i vreg1, %pc_hi20(s) + // addi.d vreg2, vreg1, %pc_lo12(s) + // ld.w vreg3, 8(vreg2) + // + // => + // + // pcalau12i vreg1, %pc_hi20(s+8) + // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1) + // + // 2. 
(large): + // pcalau12i vreg1, %pc_hi20(s) + // addi.d vreg2, $zero, %pc_lo12(s) + // lu32i.d vreg3, vreg2, %pc64_lo20(s) + // lu52i.d vreg4, vreg3, %pc64_hi12(s) + // add.d vreg5, vreg4, vreg1 + // ld.w vreg6, 8(vreg5) + // + // => + // + // pcalau12i vreg1, %pc_hi20(s+8) + // addi.d vreg2, $zero, %pc_lo12(s+8) + // lu32i.d vreg3, vreg2, %pc64_lo20(s+8) + // lu52i.d vreg4, vreg3, %pc64_hi12(s+8) + // ldx.w vreg6, vreg4, vreg1 + + std::optional CommonOffset; + DenseMap> + InlineAsmMemoryOpIndexesMap; + for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) { + switch (UseMI.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI); + return false; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + case LoongArch::FST_S: + case LoongArch::FST_D: { + if (UseMI.getOperand(1).isFI()) + return false; + // Register defined by Lo should not be the value register. + if (DestReg == UseMI.getOperand(0).getReg()) + return false; + assert(DestReg == UseMI.getOperand(1).getReg() && + "Expected base address use"); + // All load/store instructions must use the same offset. + int64_t Offset = UseMI.getOperand(2).getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + break; + } + case LoongArch::INLINEASM: + case LoongArch::INLINEASM_BR: { + // We can't do this for large pattern. 
+ if (Last) + return false; + SmallVector InlineAsmMemoryOpIndexes; + unsigned NumOps = 0; + for (unsigned I = InlineAsm::MIOp_FirstOperand; + I < UseMI.getNumOperands(); I += 1 + NumOps) { + const MachineOperand &FlagsMO = UseMI.getOperand(I); + // Should be an imm. + if (!FlagsMO.isImm()) + continue; + + const InlineAsm::Flag Flags(FlagsMO.getImm()); + NumOps = Flags.getNumOperandRegisters(); + + // Memory constraints have two operands. + if (NumOps != 2 || !Flags.isMemKind()) { + // If the register is used by something other than a memory constraint, + // we should not fold. + for (unsigned J = 0; J < NumOps; ++J) { + const MachineOperand &MO = UseMI.getOperand(I + 1 + J); + if (MO.isReg() && MO.getReg() == DestReg) + return false; + } + continue; + } + + // We can only do this for constraint m. + if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m) + return false; + + const MachineOperand &AddrMO = UseMI.getOperand(I + 1); + if (!AddrMO.isReg() || AddrMO.getReg() != DestReg) + continue; + + const MachineOperand &OffsetMO = UseMI.getOperand(I + 2); + if (!OffsetMO.isImm()) + continue; + + // All inline asm memory operands must use the same offset. + int64_t Offset = OffsetMO.getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + InlineAsmMemoryOpIndexes.push_back(I + 1); + } + InlineAsmMemoryOpIndexesMap.insert( + std::make_pair(&UseMI, InlineAsmMemoryOpIndexes)); + break; + } + } + } + + // We found a common offset. + // Update the offsets in global address lowering. + // We may have already folded some arithmetic so we need to add to any + // existing offset. + int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset; + // LA32 ignores the upper 32 bits. + if (!ST->is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + // We can only fold simm32 offsets. 
+ if (!isInt<32>(NewOffset)) + return false; + + Hi20.getOperand(1).setOffset(NewOffset); + MachineOperand &ImmOp = Lo12.getOperand(2); + ImmOp.setOffset(NewOffset); + if (Lo20 && Hi12) { + Lo20->getOperand(2).setOffset(NewOffset); + Hi12->getOperand(2).setOffset(NewOffset); + } + + // Update the immediate in the load/store instructions to add the offset. + const LoongArchInstrInfo &TII = *ST->getInstrInfo(); + for (MachineInstr &UseMI : + llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { + if (UseMI.getOpcode() == LoongArch::INLINEASM || + UseMI.getOpcode() == LoongArch::INLINEASM_BR) { + auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI]; + for (unsigned I : InlineAsmMemoryOpIndexes) { + MachineOperand &MO = UseMI.getOperand(I + 1); + switch (ImmOp.getType()) { + case MachineOperand::MO_GlobalAddress: + MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + case MachineOperand::MO_MCSymbol: + MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), ImmOp.getTargetFlags()); + MO.setOffset(ImmOp.getOffset()); + break; + case MachineOperand::MO_BlockAddress: + MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + default: + report_fatal_error("unsupported machine operand type"); + break; + } + } + } else { + UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last))); + if (Last) { + UseMI.removeOperand(2); + UseMI.removeOperand(1); + UseMI.addOperand(Last->getOperand(1)); + UseMI.addOperand(Last->getOperand(2)); + UseMI.getOperand(1).setIsKill(false); + UseMI.getOperand(2).setIsKill(false); + } else { + UseMI.removeOperand(2); + UseMI.addOperand(ImmOp); + } + } + } + + if (Last) { + Last->eraseFromParent(); + return true; + } + + MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg()); + Lo12.eraseFromParent(); + return true; +} + +bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return 
false; + + ST = &Fn.getSubtarget(); + + bool MadeChange = false; + MRI = &Fn.getRegInfo(); + for (MachineBasicBlock &MBB : Fn) { + LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); + for (MachineInstr &Hi20 : MBB) { + MachineInstr *Lo12 = nullptr; + MachineInstr *Lo20 = nullptr; + MachineInstr *Hi12 = nullptr; + MachineInstr *Last = nullptr; + if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) + continue; + MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last); + MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last); + } + } + + return MadeChange; +} + +/// Returns an instance of the Merge Base Offset Optimization pass. +FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() { + return new LoongArchMergeBaseOffsetOpt(); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index e83fc08696aea53..4401aadfe78485a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -35,6 +35,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { RegisterTargetMachine Y(getTheLoongArch64Target()); auto *PR = PassRegistry::getPassRegistry(); initializeLoongArchDeadRegisterDefinitionsPass(*PR); + initializeLoongArchMergeBaseOffsetOptPass(*PR); initializeLoongArchOptWInstrsPass(*PR); initializeLoongArchPreRAExpandPseudoPass(*PR); initializeLoongArchDAGToDAGISelLegacyPass(*PR); @@ -216,6 +217,8 @@ void LoongArchPassConfig::addMachineSSAOptimization() { void LoongArchPassConfig::addPreRegAlloc() { addPass(createLoongArchPreRAExpandPseudoPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createLoongArchMergeBaseOffsetOptPass()); } bool LoongArchPassConfig::addRegAssignAndRewriteFast() { diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 604234b243153c3..ce3a37e194d545a 100644 --- 
a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3194,11 +3194,17 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, case RISCV::SLLI_UW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: + case RISCV::FCVT_H_W_INX: case RISCV::FCVT_H_WU: + case RISCV::FCVT_H_WU_INX: case RISCV::FCVT_S_W: + case RISCV::FCVT_S_W_INX: case RISCV::FCVT_S_WU: + case RISCV::FCVT_S_WU_INX: case RISCV::FCVT_D_W: + case RISCV::FCVT_D_W_INX: case RISCV::FCVT_D_WU: + case RISCV::FCVT_D_WU_INX: case RISCV::TH_REVW: case RISCV::TH_SRRIW: if (Bits >= 32) diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 49be866448f2ea1..effec2cc776d809 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -174,11 +174,17 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::SLLI_UW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: + case RISCV::FCVT_H_W_INX: case RISCV::FCVT_H_WU: + case RISCV::FCVT_H_WU_INX: case RISCV::FCVT_S_W: + case RISCV::FCVT_S_W_INX: case RISCV::FCVT_S_WU: + case RISCV::FCVT_S_WU_INX: case RISCV::FCVT_D_W: + case RISCV::FCVT_D_W_INX: case RISCV::FCVT_D_WU: + case RISCV::FCVT_D_WU_INX: if (Bits >= 32) break; return false; @@ -350,8 +356,7 @@ static bool hasAllWUsers(const MachineInstr &OrigMI, const RISCVSubtarget &ST, // This function returns true if the machine instruction always outputs a value // where bits 63:32 match bit 31. -static bool isSignExtendingOpW(const MachineInstr &MI, - const MachineRegisterInfo &MRI, unsigned OpNo) { +static bool isSignExtendingOpW(const MachineInstr &MI, unsigned OpNo) { uint64_t TSFlags = MI.getDesc().TSFlags; // Instructions that can be determined from opcode are marked in tablegen. 
@@ -426,7 +431,7 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, assert(OpNo != -1 && "Couldn't find register"); // If this is a sign extending operation we don't need to look any further. - if (isSignExtendingOpW(*MI, MRI, OpNo)) + if (isSignExtendingOpW(*MI, OpNo)) continue; // Is this an instruction that propagates sign extend? diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td index c4a64681f5f123a..157e087a64da07b 100644 --- a/llvm/lib/Target/RISCV/RISCVProfiles.td +++ b/llvm/lib/Target/RISCV/RISCVProfiles.td @@ -6,169 +6,113 @@ // //===----------------------------------------------------------------------===// -class RISCVProfile features> - : SubtargetFeature { - // Indicates if the profile is not yet ratified, so should be treated as - // experimental. - bit Experimental = false; -} -class RISCVExperimentalProfile features> - : RISCVProfile<"experimental-"#name, features> { - let Experimental = true; -} +//===----------------------------------------------------------------------===// +// Profile Feature Lists +//===----------------------------------------------------------------------===// + +// RVI Profile Family defvar RVI20U32Features = [Feature32Bit, FeatureStdExtI]; defvar RVI20U64Features = [Feature64Bit, FeatureStdExtI]; -defvar RVA20U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZa128rs, - FeatureStdExtZicclsm]; +// RVA Profile Family + +defvar RVA20U64BaseFeatures = [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicntr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm]; +defvar RVA20U64Features = !listconcat(RVA20U64BaseFeatures, + 
[FeatureStdExtZa128rs]); +defvar RVA20S64BaseFeatures = [FeatureStdExtZifencei, + FeatureStdExtSvbare, + FeatureStdExtSvade, + FeatureStdExtSsccptr, + FeatureStdExtSstvecd, + FeatureStdExtSstvala]; defvar RVA20S64Features = !listconcat(RVA20U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala]); + RVA20S64BaseFeatures); -defvar RVA22U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihpm, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZfhmin, - FeatureStdExtZkt]; +defvar RVA22U64Features = !listconcat(RVA20U64BaseFeatures, + [FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtZba, + FeatureStdExtZbb, + FeatureStdExtZbs, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt]); +defvar RVA22S64BaseFeatures = !listconcat(RVA20S64BaseFeatures, + [FeatureStdExtSscounterenw, + FeatureStdExtSvpbmt, + FeatureStdExtSvinval]); defvar RVA22S64Features = !listconcat(RVA22U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala, - FeatureStdExtSscounterenw, - FeatureStdExtSvpbmt, - FeatureStdExtSvinval]); + RVA22S64BaseFeatures); -defvar RVA23U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZihpm, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - 
FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZfhmin, - FeatureStdExtZkt, - FeatureStdExtV, - FeatureStdExtZvfhmin, - FeatureStdExtZvbb, - FeatureStdExtZvkt, - FeatureStdExtZihintntl, - FeatureStdExtZicond, - FeatureStdExtZimop, - FeatureStdExtZcmop, - FeatureStdExtZcb, - FeatureStdExtZfa, - FeatureStdExtZawrs]; +defvar RVA23U64Features = !listconcat(RVA22U64Features, + [FeatureStdExtV, + FeatureStdExtZvfhmin, + FeatureStdExtZvbb, + FeatureStdExtZvkt, + FeatureStdExtZihintntl, + FeatureStdExtZicond, + FeatureStdExtZimop, + FeatureStdExtZcmop, + FeatureStdExtZcb, + FeatureStdExtZfa, + FeatureStdExtZawrs]); +defvar RVA23S64BaseFeatures = !listconcat(RVA22S64BaseFeatures, + [FeatureStdExtSvnapot, + FeatureStdExtSstc, + FeatureStdExtSscofpmf, + FeatureStdExtSsnpm, + FeatureStdExtSsu64xl, + FeatureStdExtH, + FeatureStdExtSsstateen, + FeatureStdExtShcounterenw, + FeatureStdExtShvstvala, + FeatureStdExtShtvala, + FeatureStdExtShvstvecd, + FeatureStdExtShvsatpa, + FeatureStdExtShgatpa]); defvar RVA23S64Features = !listconcat(RVA23U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala, - FeatureStdExtSscounterenw, - FeatureStdExtSvpbmt, - FeatureStdExtSvinval, - FeatureStdExtSvnapot, - FeatureStdExtSstc, - FeatureStdExtSscofpmf, - FeatureStdExtSsnpm, - FeatureStdExtSsu64xl, - FeatureStdExtH, - FeatureStdExtSsstateen, - FeatureStdExtShcounterenw, - FeatureStdExtShvstvala, - FeatureStdExtShtvala, - FeatureStdExtShvstvecd, - FeatureStdExtShvsatpa, - FeatureStdExtShgatpa]); - -defvar RVB23U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - 
FeatureStdExtZihpm, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZkt, - FeatureStdExtZihintntl, - FeatureStdExtZicond, - FeatureStdExtZimop, - FeatureStdExtZcmop, - FeatureStdExtZcb, - FeatureStdExtZfa, - FeatureStdExtZawrs]; + RVA23S64BaseFeatures); + +// RVB Profile Family + +defvar RVB23U64Features = !listconcat(RVA20U64BaseFeatures, + [FeatureStdExtZihpm, + FeatureStdExtZa64rs, + FeatureStdExtZihintpause, + FeatureStdExtZba, + FeatureStdExtZbb, + FeatureStdExtZbs, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZkt, + FeatureStdExtZihintntl, + FeatureStdExtZicond, + FeatureStdExtZimop, + FeatureStdExtZcmop, + FeatureStdExtZcb, + FeatureStdExtZfa, + FeatureStdExtZawrs]); defvar RVB23S64Features = !listconcat(RVB23U64Features, [FeatureStdExtZifencei, @@ -185,6 +129,8 @@ defvar RVB23S64Features = !listconcat(RVB23U64Features, FeatureStdExtSscofpmf, FeatureStdExtSsu64xl]); +// RVM Profile Family + defvar RVM23U32Features = [Feature32Bit, FeatureStdExtI, FeatureStdExtM, @@ -199,6 +145,22 @@ defvar RVM23U32Features = [Feature32Bit, FeatureStdExtZimop, FeatureStdExtZcmop]; +//===----------------------------------------------------------------------===// +// Profile Definitions for ISA String +//===----------------------------------------------------------------------===// + +class RISCVProfile features> + : SubtargetFeature { + // Indicates if the profile is not yet ratified, so should be treated as + // experimental. 
+ bit Experimental = false; +} +class RISCVExperimentalProfile features> + : RISCVProfile<"experimental-"#name, features> { + let Experimental = true; +} + def RVI20U32 : RISCVProfile<"rvi20u32", RVI20U32Features>; def RVI20U64 : RISCVProfile<"rvi20u64", RVI20U64Features>; def RVA20U64 : RISCVProfile<"rva20u64", RVA20U64Features>; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 61f8379983ef99e..e7db1ededf383b8 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -179,7 +179,8 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const { } bool RISCVSubtarget::useRVVForFixedLengthVectors() const { - return hasVInstructions() && getMinRVVVectorSizeInBits() != 0; + return hasVInstructions() && + getMinRVVVectorSizeInBits() >= RISCV::RVVBitsPerBlock; } bool RISCVSubtarget::enableSubRegLiveness() const { return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 6f76839724ee9f8..53ed46f14f14dcc 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -59,6 +59,14 @@ static std::string computeDataLayout(const Triple &TT) { // Data mangling. Ret += DataLayout::getManglingComponent(TT); + // Special features for z/OS. + if (TT.isOSzOS()) { + if (TT.isArch64Bit()) { + // Custom address space for ptr32. + Ret += "-p1:32:32"; + } + } + // Make sure that global data has at least 16 bits of alignment by // default, so that we can refer to it using LARL. We don't have any // special requirements for stack variables though. 
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 77dac1197f85e91..8404f2231680d61 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -4458,6 +4458,16 @@ void X86FrameLowering::spillFPBP(MachineFunction &MF) const { FP = TRI->getFrameRegister(MF); if (TRI->hasBasePointer(MF)) BP = TRI->getBaseRegister(); + + // Currently only inline asm and function call can clobbers fp/bp. So we can + // do some quick test and return early. + if (!MF.hasInlineAsm()) { + X86MachineFunctionInfo *X86FI = MF.getInfo(); + if (!X86FI->getFPClobberedByCall()) + FP = 0; + if (!X86FI->getBPClobberedByCall()) + BP = 0; + } if (!FP && !BP) return; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2891e21be1b2676..f69606783f25c87 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -30529,8 +30529,7 @@ enum BitTestKind : unsigned { static std::pair FindSingleBitChange(Value *V) { using namespace llvm::PatternMatch; BitTestKind BTK = UndefBit; - auto *C = dyn_cast(V); - if (C) { + if (auto *C = dyn_cast(V)) { // Check if V is a power of 2 or NOT power of 2. 
if (isPowerOf2_64(C->getZExtValue())) BTK = ConstantBit; @@ -30540,8 +30539,7 @@ static std::pair FindSingleBitChange(Value *V) { } // Check if V is some power of 2 pattern known to be non-zero - auto *I = dyn_cast(V); - if (I) { + if (auto *I = dyn_cast(V)) { bool Not = false; // Check if we have a NOT Value *PeekI; @@ -30578,13 +30576,12 @@ static std::pair FindSingleBitChange(Value *V) { Value *BitV = I->getOperand(1); + // Read past a shiftmask instruction to find count Value *AndOp; - const APInt *AndC; - if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) { - // Read past a shiftmask instruction to find count - if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1)) - BitV = AndOp; - } + uint64_t ShiftMask = I->getType()->getPrimitiveSizeInBits() - 1; + if (match(BitV, m_c_And(m_Value(AndOp), m_SpecificInt(ShiftMask)))) + BitV = AndOp; + return {BitV, BTK}; } } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index f659c168b86e0ed..1e609a84673a3cc 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2450,6 +2450,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, }(); assert(Mask && "Missing call preserved mask for calling convention"); + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFrameRegister(MF))) + X86Info->setFPClobberedByCall(true); + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) + X86Info->setBPClobberedByCall(true); + // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. 
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 315aeef65d28c8b..13d57c2fa9dfbc2 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -170,6 +170,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { SmallVector PreallocatedStackSizes; SmallVector, 0> PreallocatedArgOffsets; + // True if a function clobbers FP/BP according to its calling convention. + bool FPClobberedByCall = false; + bool BPClobberedByCall = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -328,6 +332,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set"); return PreallocatedArgOffsets[Id]; } + + bool getFPClobberedByCall() const { return FPClobberedByCall; } + void setFPClobberedByCall(bool C) { FPClobberedByCall = C; } + + bool getBPClobberedByCall() const { return BPClobberedByCall; } + void setBPClobberedByCall(bool C) { BPClobberedByCall = C; } }; } // End llvm namespace diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index b5b590e2b7acf2c..3a1f690bf039076 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -809,7 +809,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, Value *Dst = NF ? 
NF : CI->getCalledOperand(); FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType); - NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI); + NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI->getIterator()); CallInst::TailCallKind TCK = CI->getTailCallKind(); assert(TCK != CallInst::TCK_MustTail); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ab1e41ebf9a9d6c..5293a777496bc7b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1338,6 +1338,7 @@ deleteIfDead(GlobalValue &GV, if (DeleteFnCallback) DeleteFnCallback(*F); } + ReplaceableMetadataImpl::SalvageDebugInfo(GV); GV.eraseFromParent(); ++NumDeleted; return true; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 0ff5e9b815adfcc..67dd2b2052472c7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -690,8 +690,8 @@ class RuntimeCallInserter { if (EHPad && EHPad->isEHPad()) { // Replace CI with a clone with an added funclet OperandBundle OperandBundleDef OB("funclet", EHPad); - auto *NewCall = - CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, OB, CI); + auto *NewCall = CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, + OB, CI->getIterator()); NewCall->copyMetadata(*CI); CI->replaceAllUsesWith(NewCall); CI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 4e3f9e27e0c3446..deab37801ff1df8 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_component_library(LLVMInstrumentation ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp + RealtimeSanitizer.cpp ADDITIONAL_HEADER_DIRS 
${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 45b3edf2b8f2313..a1632a93966c89f 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4116,6 +4116,23 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: + case Intrinsic::aarch64_neon_rshrn: + case Intrinsic::aarch64_neon_sqrshl: + case Intrinsic::aarch64_neon_sqrshrn: + case Intrinsic::aarch64_neon_sqrshrun: + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_sqshrn: + case Intrinsic::aarch64_neon_sqshrun: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_uqrshl: + case Intrinsic::aarch64_neon_uqrshrn: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_uqshrn: + case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_ushl: + // Not handled here: aarch64_neon_vsli (vector shift left and insert) handleVectorShiftIntrinsic(I, /* Variable */ false); break; case Intrinsic::x86_avx2_psllv_d: diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index de1d4d2381c06ee..d6ba12465bb3283 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -30,7 +30,7 @@ static cl::list ContextRoots( "root of an interesting graph, which will be profiled independently " "from other similar graphs.")); -bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() { +bool PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() { return !ContextRoots.empty(); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp 
b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1ce8f58c1aa1408..41618194d12ed7c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -321,6 +321,7 @@ static cl::opt PGOFunctionCriticalEdgeThreshold( " greater than this threshold.")); extern cl::opt MaxNumVTableAnnotations; +extern cl::opt UseCtxProfile; namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. @@ -338,9 +339,12 @@ extern cl::opt EnableVTableProfileUse; extern cl::opt ProfileCorrelate; } // namespace llvm +bool shouldInstrumentForCtxProf() { + return PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() || + !UseCtxProfile.empty(); +} bool shouldInstrumentEntryBB() { - return PGOInstrumentEntry || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return PGOInstrumentEntry || shouldInstrumentForCtxProf(); } // FIXME(mtrofin): re-enable this for ctx profiling, for non-indirect calls. Ctx @@ -348,8 +352,7 @@ bool shouldInstrumentEntryBB() { // Supporting other values is relatively straight-forward - just another counter // range within the context. bool isValueProfilingDisabled() { - return DisableValueProfiling || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return DisableValueProfiling || shouldInstrumentForCtxProf(); } // Return a string describing the branch condition that can be @@ -902,7 +905,7 @@ static void instrumentOneFunc( unsigned NumCounters = InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts(); - if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) { + if (shouldInstrumentForCtxProf()) { auto *CSIntrinsic = Intrinsic::getDeclaration(M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. 
This @@ -1861,7 +1864,7 @@ static bool InstrumentAllFunctions( function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. - if (!IsCS && !PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) + if (!IsCS && !shouldInstrumentForCtxProf()) createIRLevelProfileFlagVar(M, /*IsCS=*/false); Triple TT(M.getTargetTriple()); @@ -2112,7 +2115,7 @@ static bool annotateAllFunctions( bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled(); if (PGOInstrumentEntry.getNumOccurrences() > 0) InstrumentFuncEntry = PGOInstrumentEntry; - InstrumentFuncEntry |= PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + InstrumentFuncEntry |= shouldInstrumentForCtxProf(); bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage(); for (auto &F : M) { diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp new file mode 100644 index 000000000000000..5663f446613b502 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp @@ -0,0 +1,60 @@ +//===- RealtimeSanitizer.cpp - RealtimeSanitizer instrumentation *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the RealtimeSanitizer, an LLVM transformation for +// detecting and reporting realtime safety violations. 
+// +// See also: llvm-project/compiler-rt/lib/rtsan/ +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Analysis.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" + +using namespace llvm; + +static void insertCallBeforeInstruction(Function &Fn, Instruction &Instruction, + const char *FunctionName) { + LLVMContext &Context = Fn.getContext(); + FunctionType *FuncType = FunctionType::get(Type::getVoidTy(Context), false); + FunctionCallee Func = + Fn.getParent()->getOrInsertFunction(FunctionName, FuncType); + IRBuilder<> Builder{&Instruction}; + Builder.CreateCall(Func, {}); +} + +static void insertCallAtFunctionEntryPoint(Function &Fn, + const char *InsertFnName) { + + insertCallBeforeInstruction(Fn, Fn.front().front(), InsertFnName); +} + +static void insertCallAtAllFunctionExitPoints(Function &Fn, + const char *InsertFnName) { + for (auto &BB : Fn) + for (auto &I : BB) + if (isa(&I)) + insertCallBeforeInstruction(Fn, I, InsertFnName); +} + +RealtimeSanitizerPass::RealtimeSanitizerPass( + const RealtimeSanitizerOptions &Options) {} + +PreservedAnalyses RealtimeSanitizerPass::run(Function &F, + AnalysisManager &AM) { + if (F.hasFnAttribute(Attribute::SanitizeRealtime)) { + insertCallAtFunctionEntryPoint(F, "__rtsan_realtime_enter"); + insertCallAtAllFunctionExitPoints(F, "__rtsan_realtime_exit"); + return PreservedAnalyses::none(); + } + + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d48b1286b1e08f4..526ae4e88343967 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2767,8 +2767,9 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, unsigned OpIdx = U->getOperandNo(); auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0); auto *RHS = OpIdx == 1 ? 
Mul : Ins->getOperand(1); - auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, - Ins->getName() + ".reass", Ins); + auto *NewBO = + BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, + Ins->getName() + ".reass", Ins->getIterator()); NewBO->copyIRFlags(Ins); if (VariantOp == Ins) VariantOp = NewBO; @@ -2822,9 +2823,9 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, assert(Preheader && "Loop is not in simplify form?"); auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op", - Preheader->getTerminator()); - auto *NewBO = - BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO); + Preheader->getTerminator()->getIterator()); + auto *NewBO = BinaryOperator::Create( + Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator()); // Copy NUW for ADDs if both instructions have it. if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() && diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index f173d9ca9be07ad..9c711ec183821ff 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" @@ -289,10 +288,6 @@ class StructurizeCFG { void findUndefBlocks(BasicBlock *PHIBlock, const SmallSet &Incomings, SmallVector &UndefBlks) const; - - void mergeIfCompatible(EquivalenceClasses &PhiClasses, PHINode *A, - PHINode *B); - void setPhiValues(); void simplifyAffectedPhis(); @@ -715,103 +710,10 @@ void StructurizeCFG::findUndefBlocks( } } -// If two phi nodes have compatible incoming values (for each -// incoming block, either they have the same incoming value or only one phi -// node has an incoming value), let them share the merged incoming values. 
The -// merge process is guided by the equivalence information from \p PhiClasses. -// The function will possibly update the incoming values of leader phi in -// DeletedPhis. -void StructurizeCFG::mergeIfCompatible( - EquivalenceClasses &PhiClasses, PHINode *A, PHINode *B) { - auto ItA = PhiClasses.findLeader(PhiClasses.insert(A)); - auto ItB = PhiClasses.findLeader(PhiClasses.insert(B)); - // They are already in the same class, no work needed. - if (ItA == ItB) - return; - - PHINode *LeaderA = *ItA; - PHINode *LeaderB = *ItB; - BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA]; - BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB]; - - DenseMap Mergeable(IncomingA.begin(), IncomingA.end()); - for (auto [BB, V] : IncomingB) { - auto BBIt = Mergeable.find(BB); - if (BBIt != Mergeable.end() && BBIt->second != V) - return; - // Either IncomingA does not have this value or IncomingA has the same - // value. - Mergeable.insert({BB, V}); - } - - // Update the incoming value of leaderA. - IncomingA.assign(Mergeable.begin(), Mergeable.end()); - PhiClasses.unionSets(ItA, ItB); -} - /// Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SmallVector InsertedPhis; SSAUpdater Updater(&InsertedPhis); - DenseMap> UndefBlksMap; - - // Find phi nodes that have compatible incoming values (either they have - // the same value for the same block or only one phi node has an incoming - // value, see example below). We only search again the phi's that are - // referenced by another phi, which is the case we care about. - // - // For example (-- means no incoming value): - // phi1 : BB1:phi2 BB2:v BB3:-- - // phi2: BB1:-- BB2:v BB3:w - // - // Then we can merge these incoming values and let phi1, phi2 use the - // same set of incoming values: - // - // phi1&phi2: BB1:phi2 BB2:v BB3:w - // - // By doing this, phi1 and phi2 would share more intermediate phi nodes. 
- // This would help reduce the number of phi nodes during SSA reconstruction - // and ultimately result in fewer COPY instructions. - // - // This should be correct, because if a phi node does not have incoming - // value from certain block, this means the block is not the predecessor - // of the parent block, so we actually don't care about its incoming value. - EquivalenceClasses PhiClasses; - for (const auto &[To, From] : AddedPhis) { - auto OldPhiIt = DeletedPhis.find(To); - if (OldPhiIt == DeletedPhis.end()) - continue; - - PhiMap &BlkPhis = OldPhiIt->second; - SmallVector &UndefBlks = - UndefBlksMap.FindAndConstruct(To).second; - SmallSet Incomings; - - // Get the undefined blocks shared by all the phi nodes. - if (!BlkPhis.empty()) { - for (const auto &VI : BlkPhis.front().second) - Incomings.insert(VI.first); - findUndefBlocks(To, Incomings, UndefBlks); - } - - for (const auto &[Phi, Incomings] : OldPhiIt->second) { - SmallVector IncomingPHIs; - for (const auto &[BB, V] : Incomings) { - // First, for each phi, check whether it has incoming value which is - // another phi. - if (PHINode *P = dyn_cast(V)) - IncomingPHIs.push_back(P); - } - - for (auto *OtherPhi : IncomingPHIs) { - // Skip phis that are unrelated to the phi reconstruction for now. 
- if (!DeletedPhis.contains(OtherPhi->getParent())) - continue; - mergeIfCompatible(PhiClasses, Phi, OtherPhi); - } - } - } - for (const auto &AddedPhi : AddedPhis) { BasicBlock *To = AddedPhi.first; const BBVector &From = AddedPhi.second; @@ -819,27 +721,28 @@ void StructurizeCFG::setPhiValues() { if (!DeletedPhis.count(To)) continue; + SmallVector UndefBlks; + bool CachedUndefs = false; PhiMap &Map = DeletedPhis[To]; - SmallVector &UndefBlks = UndefBlksMap[To]; - for (const auto &[Phi, Incoming] : Map) { + for (const auto &PI : Map) { + PHINode *Phi = PI.first; Value *Undef = UndefValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); - // Use leader phi's incoming if there is. - auto LeaderIt = PhiClasses.findLeader(Phi); - bool UseIncomingOfLeader = - LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi; - const auto &IncomingMap = - UseIncomingOfLeader ? DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt] - : Incoming; - + SmallSet Incomings; SmallVector ConstantPreds; - for (const auto &[BB, V] : IncomingMap) { - Updater.AddAvailableValue(BB, V); - if (isa(V)) - ConstantPreds.push_back(BB); + for (const auto &VI : PI.second) { + Incomings.insert(VI.first); + Updater.AddAvailableValue(VI.first, VI.second); + if (isa(VI.second)) + ConstantPreds.push_back(VI.first); + } + + if (!CachedUndefs) { + findUndefBlocks(To, Incomings, UndefBlks); + CachedUndefs = true; } for (auto UB : UndefBlks) { @@ -850,10 +753,6 @@ void StructurizeCFG::setPhiValues() { if (any_of(ConstantPreds, [&](BasicBlock *CP) { return DT->dominates(CP, UB); })) continue; - // Maybe already get a value through sharing with other phi nodes. 
- if (Updater.HasValueForBlock(UB)) - continue; - Updater.AddAvailableValue(UB, Undef); } @@ -861,7 +760,10 @@ void StructurizeCFG::setPhiValues() { Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); AffectedPhis.push_back(Phi); } + + DeletedPhis.erase(To); } + assert(DeletedPhis.empty()); AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5bca5cf8ff91f7c..94c7f161fc4c739 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -959,6 +959,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case Attribute::SanitizeThread: case Attribute::SanitizeHWAddress: case Attribute::SanitizeMemTag: + case Attribute::SanitizeRealtime: case Attribute::SpeculativeLoadHardening: case Attribute::StackProtect: case Attribute::StackProtectReq: diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 61183752ab90598..cfae63405966ff9 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -460,7 +460,7 @@ static void convertMetadataToAssumes(LoadInst *LI, Value *Val, LLVMContext &Ctx = LI->getContext(); new StoreInst(ConstantInt::getTrue(Ctx), PoisonValue::get(PointerType::getUnqual(Ctx)), - /*isVolatile=*/false, Align(1), LI); + /*isVolatile=*/false, Align(1), LI->getIterator()); return; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index de99fb4bee2377a..f82370d738fc690 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2632,8 +2632,9 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( } // Create phi nodes to merge from the backedge-taken check block. 
- PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getFirstNonPHI()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getFirstNonPHIIt()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7619e744f7a2f72..186b382addd7106 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -309,9 +309,11 @@ static unsigned getNumElems(unsigned Size, unsigned PartNumElems, #if !defined(NDEBUG) /// Print a short descriptor of the instruction bundle suitable for debug output. -static std::string shortBundleName(ArrayRef VL) { +static std::string shortBundleName(ArrayRef VL, int Idx = -1) { std::string Result; raw_string_ostream OS(Result); + if (Idx >= 0) + OS << "Idx: " << Idx << ", "; OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; OS.flush(); return Result; @@ -2971,13 +2973,24 @@ class BoUpSLP { /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? enum EntryState { - Vectorize, - ScatterVectorize, - StridedVectorize, - NeedToGather + Vectorize, ///< The node is regularly vectorized. + ScatterVectorize, ///< Masked scatter/gather node. + StridedVectorize, ///< Strided loads (and stores) + NeedToGather, ///< Gather/buildvector node. + CombinedVectorize, ///< Vectorized node, combined with its user into more + ///< complex node like select/cmp to minmax, mul/add to + ///< fma, etc. Must be used for the following nodes in + ///< the pattern, not the very first one. }; EntryState State; + /// List of combined opcodes supported by the vectorizer. 
+ enum CombinedOpcode { + NotCombinedOp = -1, + MinMax = Instruction::OtherOpsEnd + 1, + }; + CombinedOpcode CombinedOp = NotCombinedOp; + /// Does this sequence require some shuffling? SmallVector ReuseShuffleIndices; @@ -3165,6 +3178,9 @@ class BoUpSLP { case NeedToGather: dbgs() << "NeedToGather\n"; break; + case CombinedVectorize: + dbgs() << "CombinedVectorize\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -6405,6 +6421,16 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::NeedToGather; } + if (any_of(VL, [&SourceVectors](Value *V) { + // The last InsertElement can have multiple uses. + return SourceVectors.contains(V) && !V->hasOneUse(); + })) { + assert(SLPReVec && "Only supported by REVEC."); + LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " + "multiple uses.\n"); + return TreeEntry::NeedToGather; + } + return TreeEntry::Vectorize; } case Instruction::Load: { @@ -7213,6 +7239,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); break; + case TreeEntry::CombinedVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } @@ -8294,6 +8321,22 @@ void BoUpSLP::transformNodes() { } break; } + case Instruction::Select: { + if (E.State != TreeEntry::Vectorize) + break; + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars); + if (MinMaxID == Intrinsic::not_intrinsic) + break; + // This node is a minmax node. + E.CombinedOp = TreeEntry::MinMax; + TreeEntry *CondEntry = const_cast(getOperandEntry(&E, 0)); + if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 && + CondEntry->State == TreeEntry::Vectorize) { + // The condition node is part of the combined minmax node. 
+ CondEntry->State = TreeEntry::CombinedVectorize; + } + break; + } default: break; } @@ -9430,6 +9473,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + if (E->CombinedOp != TreeEntry::NotCombinedOp) + ShuffleOrOp = E->CombinedOp; SetVector UniqueValues(VL.begin(), VL.end()); const unsigned Sz = UniqueValues.size(); SmallBitVector UsedScalars(Sz, false); @@ -9515,6 +9560,31 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return VecCost - ScalarCost; }; + auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL); + if (MinMaxID == Intrinsic::not_intrinsic) + return InstructionCost::getInvalid(); + Type *CanonicalType = Ty; + if (CanonicalType->isPtrOrPtrVectorTy()) + CanonicalType = CanonicalType->getWithNewType(IntegerType::get( + CanonicalType->getContext(), + DL->getTypeSizeInBits(CanonicalType->getScalarType()))); + + IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, + {CanonicalType, CanonicalType}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. + if (VI && SelectOnly) { + assert(!Ty->isVectorTy() && "Expected only for scalar type."); + auto *CI = cast(VI->getOperand(0)); + IntrinsicCost -= + TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(), + CI->getPredicate(), CostKind, CI); + } + return IntrinsicCost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. 
@@ -9775,28 +9845,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost ScalarCost = TTI->getCmpSelInstrCost( E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, CostKind, VI); - auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI); - if (MinMaxID != Intrinsic::not_intrinsic) { - Type *CanonicalType = OrigScalarTy; - if (CanonicalType->isPtrOrPtrVectorTy()) - CanonicalType = CanonicalType->getWithNewType(IntegerType::get( - CanonicalType->getContext(), - DL->getTypeSizeInBits(CanonicalType->getScalarType()))); - - IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, - {CanonicalType, CanonicalType}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be - // dead and we can adjust the cost by removing their cost. - if (SelectOnly) { - auto *CI = cast(VI->getOperand(0)); - IntrinsicCost -= TTI->getCmpSelInstrCost( - CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), - CI->getPredicate(), CostKind, CI); - } - ScalarCost = std::min(ScalarCost, IntrinsicCost); - } + InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; return ScalarCost; }; @@ -9805,30 +9856,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VecCost = TTI->getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects - // in VL. 
- // - auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL); - if (MinMaxID != Intrinsic::not_intrinsic) { - Type *CanonicalType = VecTy; - if (CanonicalType->isPtrOrPtrVectorTy()) - CanonicalType = CanonicalType->getWithNewType(IntegerType::get( - CanonicalType->getContext(), - DL->getTypeSizeInBits(CanonicalType->getScalarType()))); - IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, - {CanonicalType, CanonicalType}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be - // dead and we can adjust the cost by removing their cost. - if (SelectOnly) { - auto *CI = - cast(cast(VL.front())->getOperand(0)); - IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy, - MaskTy, VecPred, CostKind); - } - VecCost = std::min(VecCost, IntrinsicCost); - } if (auto *SI = dyn_cast(VL0)) { auto *CondType = getWidenedType(SI->getCondition()->getType(), VL.size()); @@ -9850,6 +9877,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case TreeEntry::MinMax: { + auto GetScalarCost = [&](unsigned Idx) { + return GetMinMaxCost(OrigScalarTy); + }; + auto GetVectorCost = [&](InstructionCost CommonCost) { + InstructionCost VecCost = GetMinMaxCost(VecTy); + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -10588,6 +10625,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallPtrSet CheckedExtracts; for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; + // No need to count the cost for combined entries, they are combined and + // just skip their cost. 
+ if (TE.State == TreeEntry::CombinedVectorize) { + LLVM_DEBUG( + dbgs() << "SLP: Skipping cost for combined node that starts with " + << *TE.Scalars[0] << ".\n"; + TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + continue; + } if (TE.isGather()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && @@ -10595,7 +10641,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Some gather nodes might be absolutely the same as some vectorizable // nodes after reordering, need to handle it. LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " - << shortBundleName(TE.Scalars) << ".\n" + << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); continue; } @@ -10604,7 +10650,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " - << shortBundleName(TE.Scalars) << ".\n" + << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } @@ -12594,10 +12640,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle " - << shortBundleName(E->Scalars) << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle " + << shortBundleName(E->Scalars, E->Idx) << ".\n"); // Restore the mask for previous partially matched values. 
Mask.resize(E->Scalars.size()); const TreeEntry *FrontTE = Entries.front().front(); @@ -12956,10 +13000,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return ShuffleBuilder.finalize(E->ReuseShuffleIndices); }; - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::StridedVectorize) && - "Unhandled state"); + assert(!E->isGather() && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); Instruction *VL0 = E->getMainOp(); diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 42b1b86ebaadf0e..5a4435bc96e0c80 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -89,6 +89,7 @@ function(builtin_default_target compiler_rt_path) -DLLVM_RUNTIME_OUTPUT_INTDIR=${LLVM_TOOLS_BINARY_DIR} -DLLVM_DEFAULT_TARGET_TRIPLE=${LLVM_TARGET_TRIPLE} -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR} + -DLLVM_CMAKE_DIR=${CMAKE_BINARY_DIR} -DCMAKE_C_COMPILER_WORKS=ON -DCMAKE_ASM_COMPILER_WORKS=ON ${COMMON_CMAKE_ARGS} @@ -128,6 +129,7 @@ function(builtin_register_target compiler_rt_path name) CMAKE_ARGS -DLLVM_LIBRARY_OUTPUT_INTDIR=${LLVM_LIBRARY_DIR} -DLLVM_RUNTIME_OUTPUT_INTDIR=${LLVM_TOOLS_BINARY_DIR} -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON + -DLLVM_CMAKE_DIR=${CMAKE_BINARY_DIR} -DCMAKE_C_COMPILER_WORKS=ON -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll index c73b8b716bc5417..8afb26722cc16b7 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll @@ -42,24 +42,24 @@ define i32 @smax(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'smax' @@ -67,50 +67,25 @@ define i32 @smax(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> 
undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> 
undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'smax' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> 
@llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -173,24 +148,24 @@ define i32 @smin(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'smin' @@ -198,50 +173,25 @@ define i32 @smin(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; 
SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'smin' -; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> 
@llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -271,3 +221,5 @@ define i32 @smin(i32 %arg) { ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; ALL-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll index 68687743d75b610..2b3728b556d9e5b 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll @@ -42,24 +42,24 @@ define i32 @umax(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for 
instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'umax' @@ -67,50 +67,25 @@ define i32 @umax(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> 
undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: 
Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) +; 
SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'umax' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> 
undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: ret i32 undef ; %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -173,24 +148,24 @@ define i32 @umin(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x 
i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'umin' @@ -198,50 +173,25 @@ define i32 @umin(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 
32 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an 
estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'umin' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -271,3 +221,5 @@ 
define i32 @umin(i32 %arg) { ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ALL-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll index cdd61c147fdad93..c6f1cbab80438c2 100644 --- a/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll +++ b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll @@ -3,18 +3,18 @@ define void @get_lane_mask() { ; CHECK-LABEL: 'get_lane_mask' -; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, 
i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 48 for instruction: %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll index 67ab70569175e73..484cd201d14a074 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -30,79 +30,79 @@ declare {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x define i32 @sadd(i32 %arg) { ; V8M-RECIP-LABEL: 'sadd' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } 
@llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) 
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sadd' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-RECIP-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } 
@llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'sadd' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 154 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } 
@llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'sadd' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } 
@llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an 
estimated cost of 66 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } 
@llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sadd' @@ -110,34 +110,34 @@ define i32 @sadd(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) 
-; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; 
NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sadd' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost 
Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x 
i8> undef, <64 x i8> undef) @@ -188,26 +188,26 @@ declare {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x define i32 @uadd(i32 %arg) { ; V8M-RECIP-LABEL: 'uadd' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { 
<8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x 
i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'uadd' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -226,7 +226,7 @@ define i32 @uadd(i32 %arg) { ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'uadd' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -245,26 +245,26 @@ define i32 @uadd(i32 %arg) { ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'uadd' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: 
%V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } 
@llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'uadd' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -283,7 +283,7 @@ define i32 @uadd(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'uadd' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } 
@llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -346,79 +346,79 @@ declare {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x define i32 @ssub(i32 %arg) { ; V8M-RECIP-LABEL: 'ssub' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an 
estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } 
@llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; 
V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'ssub' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; 
NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 
'ssub' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 154 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, 
i16 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'ssub' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) 
-; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { 
<16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> 
undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'ssub' @@ -426,34 +426,34 @@ define i32 @ssub(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call 
{ <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'ssub' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) @@ -504,26 +504,26 @@ declare {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x define i32 @usub(i32 %arg) { ; V8M-RECIP-LABEL: 'usub' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 
for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } 
@llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> 
undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'usub' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -542,7 +542,7 @@ define i32 @usub(i32 %arg) { ; NEON-RECIP-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'usub' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -561,26 +561,26 @@ define i32 @usub(i32 %arg) { ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'usub' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: 
%V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 
x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> 
undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'usub' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -599,7 +599,7 @@ define i32 @usub(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'usub' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost 
Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll index 384ebbee8eba144..da7096af4d80c1c 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll @@ -37,27 +37,27 @@ declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @add(i32 %arg) { ; V8M-RECIP-LABEL: 'add' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: 
%V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found 
an estimated cost of 16 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost 
Model: Found an estimated cost of 32 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'add' @@ -113,26 +113,26 @@ define i32 @add(i32 %arg) { ; V8M-SIZE-LABEL: 'add' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 
x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 
x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'add' @@ -243,27 +243,27 @@ declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @sub(i32 %arg) { ; V8M-RECIP-LABEL: 'sub' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated 
cost of 60 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: 
Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sub' @@ -319,26 +319,26 @@ define i32 @sub(i32 %arg) { ; V8M-SIZE-LABEL: 'sub' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> 
undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 
undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x 
i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sub' diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll index 64d49f7e492db49..8a12ca2df234efe 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll @@ -37,27 +37,27 @@ declare <64 x i8> 
@llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @add(i32 %arg) { ; V8M-RECIP-LABEL: 'add' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 
for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost 
Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost 
Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'add' @@ -112,27 +112,27 @@ define i32 @add(i32 %arg) { ; ; V8M-SIZE-LABEL: 'add' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, 
<2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'add' @@ -243,27 +243,27 @@ declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @sub(i32 %arg) { ; V8M-RECIP-LABEL: 'sub' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost 
of 5 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated 
cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: 
Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sub' @@ -318,27 +318,27 @@ define i32 @sub(i32 %arg) { ; ; V8M-SIZE-LABEL: 'sub' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> 
undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sub' diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll index 184b7076d02bea9..e86a1b95a7b36b7 100644 --- a/llvm/test/Analysis/CostModel/ARM/cmps.ll +++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll @@ -227,7 +227,7 @@ define void @minmax() { ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, 
i32 undef, i32 undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef @@ -287,7 +287,7 @@ define void @minmax() { ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll index 4ef35f154789604..7dcab51e0a1cf3f 100644 --- 
a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for 
instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll index f8bce922abcd040..617c6b4605189cf 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 
@llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x 
i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll index 24f76e60cd31027..764034d18bee026 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = 
call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 
@llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll index ee7025cf5db2501..b5431f63bdca977 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; 
NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call 
i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) diff --git a/llvm/test/Assembler/range-attribute-invalid-range.ll b/llvm/test/Assembler/range-attribute-invalid-range.ll index cf6d3f0801838c9..1ddb6745e5dc208 100644 --- a/llvm/test/Assembler/range-attribute-invalid-range.ll +++ b/llvm/test/Assembler/range-attribute-invalid-range.ll @@ -1,6 +1,6 @@ ; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: the range should not represent the full or empty set! -define void @range_empty(i8 range(i8 0, 0) %a) { +; CHECK: the range represent the empty set but limits aren't 0! +define void @range_empty(i8 range(i8 1, 1) %a) { ret void } diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index f4dc9b9849827a8..4402289ac170d9b 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -505,6 +505,12 @@ define void @f86() nosanitize_bounds ret void; } +; CHECK: define void @f92() #53 +define void @f92() sanitize_realtime +{ + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -531,8 +537,8 @@ define range(i32 -1, 42) i32 @range_attribute(<4 x i32> range(i32 -1, 42) %a) { ret i32 0 } -; CHECK: define range(i32 0, 42) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) -define range(i32 0, 42) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) { +; CHECK: define range(i32 0, 0) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) +define range(i32 0, 0) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) { ret i32 0 } @@ -599,6 +605,7 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ; 
CHECK: attributes #50 = { disable_sanitizer_instrumentation } ; CHECK: attributes #51 = { uwtable(sync) } ; CHECK: attributes #52 = { nosanitize_bounds } +; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index e5592b347425a24..fd60c49a4be39be 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1562,7 +1562,7 @@ exit: ; CHECK: select <2 x i1> , <2 x i8> , <2 x i8> call void @f.nobuiltin() builtin - ; CHECK: call void @f.nobuiltin() #52 + ; CHECK: call void @f.nobuiltin() #53 call fastcc noalias ptr @f.noalias() noinline ; CHECK: call fastcc noalias ptr @f.noalias() #12 @@ -1989,6 +1989,9 @@ declare void @f.allockind() allockind("alloc,uninitialized") declare void @f.sanitize_numerical_stability() sanitize_numerical_stability ; CHECK: declare void @f.sanitize_numerical_stability() #51 +declare void @f.sanitize_realtime() sanitize_realtime +; CHECK: declare void @f.sanitize_realtime() #52 + ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) @@ -2111,7 +2114,8 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #49 = { nosanitize_bounds } ; CHECK: attributes #50 = { allockind("alloc,uninitialized") } ; CHECK: attributes #51 = { sanitize_numerical_stability } -; CHECK: attributes #52 = { builtin } +; CHECK: attributes #52 = { sanitize_realtime } +; CHECK: attributes #53 = { builtin } ;; Metadata diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index e0226c35cc2de69..fb96b9ff2952e82 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ 
b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -577,11 +577,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] @@ -642,10 +642,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 ; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v27 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v26 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_mov_b64 s[20:21], -1 @@ -655,6 +655,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard @@ -740,8 +744,8 @@ define amdgpu_kernel void 
@introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] @@ -797,8 +801,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 @@ -807,6 +811,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 8bf7a1cc42f642f..4f0bc512565d135 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -222,7 +222,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-LABEL: syncscope_workgroup_nortn: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -272,7 +272,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: syncscope_workgroup_nortn: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index d732da1a67bc1fb..970bb08e1838b26 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2131,26 +2131,14 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX10-NEXT: global_store_short v[2:3], v5, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: test_store_fpimm: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off -; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: test_store_fpimm: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80 -; GFX11FAKE16-NEXT: v_mov_b32_e32 
v5, 0x4228 -; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off -; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: test_store_fpimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11-NEXT: global_store_b16 v[0:1], v4, off +; GFX11-NEXT: global_store_b16 v[2:3], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 7352fcdd071d5b1..9fe7544003568cf 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -246,9 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -264,9 +262,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -390,9 +386,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: 
buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -408,9 +402,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 896b85ea14da11e..422c8a0be23b498 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -630,7 +630,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -682,7 +682,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: 
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -839,7 +839,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -893,7 +893,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1062,7 +1062,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -1116,7 +1116,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -1469,7 +1469,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1525,7 +1525,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2006,7 +2006,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2060,7 +2060,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 
0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2950,7 +2950,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -3002,7 +3002,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -3159,7 +3159,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3213,7 +3213,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3382,7 +3382,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -3436,7 +3436,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -3789,7 +3789,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3845,7 +3845,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -4198,7 +4198,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: 
flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -4254,7 +4254,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -5239,7 +5239,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -5291,7 +5291,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir new file mode 100644 index 000000000000000..0c31b36e90cb0c9 --- 
/dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_u32_e64__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* 
sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %2 + SI_RETURN + +... + diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir new file mode 100644 index 000000000000000..b7ade2147e40cd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -0,0 +1,863 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX803 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX900 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# 
RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_co_u32_e32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, 
implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; 
GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX940: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec 
+ ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1, implicit $vcc + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: SI_RETURN 
+ ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], 
[[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 
1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX940: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 
2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, 
%stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN implicit %2 + +... + +--- +name: local_stack_alloc__s_add_i32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: SI_RETURN + 
; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN + %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %0 + %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__s_add_i32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, 
implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN + %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %0 + %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:SREG_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__s_add_i32__literal_offsets_live_scc +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: S_NOP 0, implicit $scc + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN implicit $scc + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: S_NOP 0, implicit $scc + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN implicit $scc + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: S_NOP 0, implicit $scc + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: 
SI_RETURN implicit $scc + ; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: S_NOP 0, implicit $scc + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN implicit $scc + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: S_NOP 0, implicit $scc + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN implicit $scc + %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %0 + S_NOP 0, implicit $scc + %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %1 + SI_RETURN implicit $scc + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX803: liveins: $vgpr0 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; 
GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], 
%vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX803: liveins: $vgpr0 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = 
COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = 
V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX12-NEXT: 
[[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX900: liveins: $sgpr8 
+ ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
%stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: 
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir new file mode 100644 index 000000000000000..7ed153133517786 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir @@ -0,0 +1,523 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX900 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_u32_e32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX940: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + INLINEASM &"; 
use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX940: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; 
GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX940: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__vgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; 
GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: 
%vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + 
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY 
$sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; 
GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +# Should be OK to fold with clamp modifier, which should be preserved. 
+--- +name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX940: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed 
[[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll new file mode 100644 index 000000000000000..1f36f7a0d9616e0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s + +define half @swap(half %a, half %b, i32 %i) { +; GFX11-TRUE16-LABEL: swap: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: swap: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 +; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: swap: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: swap: +; GFX12-FAKE16: ; %bb.0: ; %entry +; 
GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop +; GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 +; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +entry: + br label %loop + +loop: + %x = phi half [%a, %entry], [%y, %loop] + %y = phi half [%b, %entry], [%x, %loop] + %i2 = phi i32 [%i, %entry], [%i3, %loop] + + %i3 = sub i32 %i2, 1 + + %cmp = icmp eq i32 %i3, 0 + br i1 %cmp, label %ret, label %loop + +ret: + ret half %x +} diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir new file mode 100644 index 000000000000000..a65ec9c67654927 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir @@ -0,0 +1,41 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass si-insert-waitcnts -verify-machineinstrs %s -o - | FileCheck %s + +--- +# CHECK-LABEL: name: t1 +# CHECK: liveins: $vgpr0 +name: t1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 +... 
+ +--- +# CHECK-LABEL: name: t2 +# CHECK: liveins: $sgpr2_sgpr3 +# CHECK: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) +name: t2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr2_sgpr3 + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) +... + +--- +# CHECK-LABEL: name: t3 +# CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 +# CHECK: $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) +name: t3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 + $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) +... 
diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 9bb8a2f9f0282c2..46254994580d2d0 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s1, exec_lo, s1 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GCN-NEXT: s_and_b32 s1, exec_lo, s4 ; GCN-NEXT: s_or_b32 s2, s1, s2 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GCN-NEXT: s_cbranch_execz .LBB2_6 @@ -190,17 +190,20 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: .LBB2_4: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_mov_b32 s1, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s3 +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: s_mov_b32 s4, -1 +; GCN-NEXT: s_and_saveexec_b32 s1, s3 ; GCN-NEXT: s_cbranch_execz .LBB2_1 ; GCN-NEXT: ; %bb.5: ; %latch ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 +; GCN-NEXT: v_mov_b32_e32 v7, v0 ; GCN-NEXT: s_add_i32 s0, s0, 1 -; GCN-NEXT: s_orn2_b32 s1, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_6: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v6 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/CodeGen/LoongArch/block-address.ll b/llvm/test/CodeGen/LoongArch/block-address.ll index eaba81f3563d7f3..114cbb73a512592 100644 --- a/llvm/test/CodeGen/LoongArch/block-address.ll +++ b/llvm/test/CodeGen/LoongArch/block-address.ll @@ -8,11 +8,10 @@ define void @test_blockaddress() nounwind { ; LA32-LABEL: test_blockaddress: ; LA32: # %bb.0: ; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(addr) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(addr) ; LA32-NEXT: pcalau12i $a1, %pc_hi20(.Ltmp0) ; LA32-NEXT: addi.w $a1, $a1, %pc_lo12(.Ltmp0) -; LA32-NEXT: st.w $a1, $a0, 0 -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(addr) +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(addr) ; LA32-NEXT: jr $a0 ; LA32-NEXT: .Ltmp0: # Block address taken ; LA32-NEXT: .LBB0_1: # %block @@ -21,11 +20,10 @@ define void @test_blockaddress() nounwind { ; LA64-LABEL: test_blockaddress: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(addr) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(addr) ; LA64-NEXT: pcalau12i $a1, %pc_hi20(.Ltmp0) ; LA64-NEXT: addi.d $a1, $a1, %pc_lo12(.Ltmp0) -; LA64-NEXT: st.d $a1, $a0, 0 -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(addr) +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(addr) ; LA64-NEXT: jr $a0 ; LA64-NEXT: .Ltmp0: # Block address taken ; LA64-NEXT: .LBB0_1: # %block diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll index cc6ba057019c652..34fbec03c535b07 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll @@ -64,26 +64,19 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: fld.d $fa1, $a0, 0 +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_1) -; CHECK-NEXT: fld.d $fa2, $a0, 0 +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_2) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_2) -; CHECK-NEXT: fld.d $fa3, $a0, 0 +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2) ; CHECK-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI3_3) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_3) -; CHECK-NEXT: fld.d $fa4, $a0, 0 +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_4) -; CHECK-NEXT: fld.d $fa5, $a0, 0 +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_5) -; CHECK-NEXT: fld.d $fa6, $a0, 0 +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_6) -; CHECK-NEXT: fld.d $fa7, $a0, 0 +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6) ; CHECK-NEXT: addi.d $a0, $zero, 1 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: ffint.d.l $fa0, $fa0 diff --git a/llvm/test/CodeGen/LoongArch/double-imm.ll b/llvm/test/CodeGen/LoongArch/double-imm.ll index 3e89db3ec5c8ccd..8d50b27907d72b4 100644 --- a/llvm/test/CodeGen/LoongArch/double-imm.ll +++ b/llvm/test/CodeGen/LoongArch/double-imm.ll @@ -36,15 +36,13 @@ define double @f64_constant_pi() nounwind { ; LA32-LABEL: f64_constant_pi: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA32-NEXT: ret ; ; LA64-LABEL: f64_constant_pi: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA64-NEXT: ret ret double 3.1415926535897931159979634685441851615905761718750 } diff --git a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll index 0c6856bafd23d76..551ab6ea44c66d0 100644 --- a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll +++ b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll @@ -1,6145 +1,2563 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define dso_local { float, double } @test1() { -; LA32-LABEL: test1: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI0_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI0_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test1: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1280 +; CHECK-NEXT: vldi $vr1, -1024 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.0000000000, double 2.0000000000 } } define dso_local { float, double } @test2() { -; LA32-LABEL: test2: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI1_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI1_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test2: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test2: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1279 +; CHECK-NEXT: vldi $vr1, -1023 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.1250000000, double 2.1250000000 } } define dso_local { float, double } @test3() { -; LA32-LABEL: test3: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test3: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1278 +; CHECK-NEXT: vldi $vr1, -1022 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.2500000000, double 2.2500000000 } } define dso_local { float, double } @test4() { -; LA32-LABEL: test4: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI3_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI3_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test4: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1277 +; CHECK-NEXT: vldi $vr1, -1021 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.3750000000, double 
2.3750000000 } } define dso_local { float, double } @test5() { -; LA32-LABEL: test5: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test5: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1276 +; CHECK-NEXT: vldi $vr1, -1020 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.5000000000, double 2.5000000000 } } define dso_local { float, double } @test6() { -; LA32-LABEL: test6: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test6: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1275 +; CHECK-NEXT: vldi $vr1, -1019 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.6250000000, double 2.6250000000 } } define dso_local { float, double } @test7() { -; LA32-LABEL: test7: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test7: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1274 +; CHECK-NEXT: vldi $vr1, -1018 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.7500000000, double 2.7500000000 } } define dso_local { float, double } @test8() { -; LA32-LABEL: test8: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test8: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1273 +; CHECK-NEXT: vldi $vr1, -1017 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.8750000000, double 2.8750000000 } } define dso_local { float, double } @test9() { -; LA32-LABEL: test9: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI8_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI8_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test9: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI8_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI8_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1272 +; CHECK-NEXT: vldi $vr1, -1016 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.0000000000, double 3.0000000000 } } define dso_local { float, double } @test10() { -; LA32-LABEL: test10: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI9_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI9_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test10: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1271 +; CHECK-NEXT: vldi $vr1, -1015 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.1250000000, double 3.1250000000 } } define dso_local { float, double } @test11() { -; LA32-LABEL: test11: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI10_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI10_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test11: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI10_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI10_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI10_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1270 +; CHECK-NEXT: vldi $vr1, -1014 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.2500000000, double 3.2500000000 } } define dso_local { float, double } @test12() { -; LA32-LABEL: test12: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI11_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI11_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test12: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI11_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI11_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1269 +; CHECK-NEXT: vldi $vr1, -1013 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.3750000000, double 3.3750000000 } } define dso_local { float, double } @test13() { -; LA32-LABEL: test13: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test13: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI12_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI12_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI12_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1268 +; CHECK-NEXT: vldi $vr1, -1012 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.5000000000, double 3.5000000000 } } define dso_local { float, double } @test14() { -; LA32-LABEL: test14: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI13_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI13_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test14: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test14: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1267 +; CHECK-NEXT: vldi $vr1, -1011 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.6250000000, double 3.6250000000 } } define dso_local { float, double } @test15() { -; LA32-LABEL: test15: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test15: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI14_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI14_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: 
test15: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1266 +; CHECK-NEXT: vldi $vr1, -1010 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.7500000000, double 3.7500000000 } } define dso_local { float, double } @test16() { -; LA32-LABEL: test16: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI15_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI15_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test16: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI15_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI15_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1265 +; CHECK-NEXT: vldi $vr1, -1009 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.8750000000, double 3.8750000000 } } define dso_local { float, double } @test17() { -; LA32-LABEL: test17: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI16_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI16_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test17: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI16_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI16_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test17: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1264 +; CHECK-NEXT: vldi $vr1, -1008 +; CHECK-NEXT: ret entry: ret { float, 
double } { float 4.0000000000, double 4.0000000000 } } define dso_local { float, double } @test18() { -; LA32-LABEL: test18: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI17_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI17_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test18: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI17_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI17_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test18: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1263 +; CHECK-NEXT: vldi $vr1, -1007 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.2500000000, double 4.2500000000 } } define dso_local { float, double } @test19() { -; LA32-LABEL: test19: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI18_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI18_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test19: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI18_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI18_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test19: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1262 +; CHECK-NEXT: vldi $vr1, -1006 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.5000000000, double 4.5000000000 } } define dso_local { float, double } @test20() { -; LA32-LABEL: test20: -; LA32: # %bb.0: 
# %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI19_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI19_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test20: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI19_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI19_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test20: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1261 +; CHECK-NEXT: vldi $vr1, -1005 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.7500000000, double 4.7500000000 } } define dso_local { float, double } @test21() { -; LA32-LABEL: test21: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI20_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI20_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test21: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI20_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI20_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test21: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1260 +; CHECK-NEXT: vldi $vr1, -1004 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.0000000000, double 5.0000000000 } } define dso_local { float, double } @test22() { -; LA32-LABEL: test22: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI21_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI21_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test22: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test22: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1259 +; CHECK-NEXT: vldi $vr1, -1003 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.2500000000, double 5.2500000000 } } define dso_local { float, double } @test23() { -; LA32-LABEL: test23: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI22_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI22_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test23: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI22_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI22_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1258 +; CHECK-NEXT: vldi $vr1, -1002 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.5000000000, double 5.5000000000 } } define dso_local { float, double } @test24() { -; LA32-LABEL: test24: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI23_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI23_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; 
LA32-NEXT: ret -; -; LA64-LABEL: test24: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI23_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI23_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test24: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1257 +; CHECK-NEXT: vldi $vr1, -1001 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.7500000000, double 5.7500000000 } } define dso_local { float, double } @test25() { -; LA32-LABEL: test25: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI24_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI24_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test25: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI24_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI24_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test25: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1256 +; CHECK-NEXT: vldi $vr1, -1000 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.0000000000, double 6.0000000000 } } define dso_local { float, double } @test26() { -; LA32-LABEL: test26: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI25_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI25_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test26: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA64-NEXT: addi.d $a0, 
$a0, %pc_lo12(.LCPI25_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI25_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test26: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1255 +; CHECK-NEXT: vldi $vr1, -999 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.2500000000, double 6.2500000000 } } define dso_local { float, double } @test27() { -; LA32-LABEL: test27: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI26_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI26_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test27: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI26_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI26_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test27: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1254 +; CHECK-NEXT: vldi $vr1, -998 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.5000000000, double 6.5000000000 } } define dso_local { float, double } @test28() { -; LA32-LABEL: test28: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI27_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI27_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test28: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI27_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI27_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test28: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1253 +; CHECK-NEXT: vldi $vr1, -997 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.7500000000, double 6.7500000000 } } define dso_local { float, double } @test29() { -; LA32-LABEL: test29: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI28_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI28_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test29: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI28_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI28_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test29: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1252 +; CHECK-NEXT: vldi $vr1, -996 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.0000000000, double 7.0000000000 } } define dso_local { float, double } @test30() { -; LA32-LABEL: test30: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI29_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI29_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test30: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test30: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi 
$vr0, -1251 +; CHECK-NEXT: vldi $vr1, -995 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.2500000000, double 7.2500000000 } } define dso_local { float, double } @test31() { -; LA32-LABEL: test31: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI30_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI30_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test31: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI30_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI30_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test31: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1250 +; CHECK-NEXT: vldi $vr1, -994 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.5000000000, double 7.5000000000 } } define dso_local { float, double } @test32() { -; LA32-LABEL: test32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI31_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI31_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI31_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI31_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1249 +; CHECK-NEXT: vldi $vr1, -993 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.7500000000, double 7.7500000000 } } define 
dso_local { float, double } @test33() { -; LA32-LABEL: test33: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI32_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI32_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test33: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI32_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI32_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test33: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1248 +; CHECK-NEXT: vldi $vr1, -992 +; CHECK-NEXT: ret entry: ret { float, double } { float 8.0000000000, double 8.0000000000 } } define dso_local { float, double } @test34() { -; LA32-LABEL: test34: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI33_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI33_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test34: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI33_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI33_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test34: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1247 +; CHECK-NEXT: vldi $vr1, -991 +; CHECK-NEXT: ret entry: ret { float, double } { float 8.5000000000, double 8.5000000000 } } define dso_local { float, double } @test35() { -; LA32-LABEL: test35: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI34_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI34_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test35: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI34_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI34_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test35: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1246 +; CHECK-NEXT: vldi $vr1, -990 +; CHECK-NEXT: ret entry: ret { float, double } { float 9.0000000000, double 9.0000000000 } } define dso_local { float, double } @test36() { -; LA32-LABEL: test36: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI35_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI35_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test36: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI35_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI35_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test36: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1245 +; CHECK-NEXT: vldi $vr1, -989 +; CHECK-NEXT: ret entry: ret { float, double } { float 9.5000000000, double 9.5000000000 } } define dso_local { float, double } @test37() { -; LA32-LABEL: test37: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI36_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI36_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test37: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI36_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI36_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test37: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1244 +; CHECK-NEXT: vldi $vr1, -988 +; CHECK-NEXT: ret entry: ret { float, double } { float 10.0000000000, double 10.0000000000 } } define dso_local { float, double } @test38() { -; LA32-LABEL: test38: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI37_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI37_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test38: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test38: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1243 +; CHECK-NEXT: vldi $vr1, -987 +; CHECK-NEXT: ret entry: ret { float, double } { float 10.5000000000, double 10.5000000000 } } define dso_local { float, double } @test39() { -; LA32-LABEL: test39: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI38_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI38_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test39: -; LA64: # %bb.0: # %entry 
-; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI38_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI38_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test39: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1242 +; CHECK-NEXT: vldi $vr1, -986 +; CHECK-NEXT: ret entry: ret { float, double } { float 11.0000000000, double 11.0000000000 } } define dso_local { float, double } @test40() { -; LA32-LABEL: test40: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI39_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI39_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test40: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI39_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI39_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test40: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1241 +; CHECK-NEXT: vldi $vr1, -985 +; CHECK-NEXT: ret entry: ret { float, double } { float 11.5000000000, double 11.5000000000 } } define dso_local { float, double } @test41() { -; LA32-LABEL: test41: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI40_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI40_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test41: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI40_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI40_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test41: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1240 +; CHECK-NEXT: vldi $vr1, -984 +; CHECK-NEXT: ret entry: ret { float, double } { float 12.0000000000, double 12.0000000000 } } define dso_local { float, double } @test42() { -; LA32-LABEL: test42: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI41_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI41_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test42: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI41_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI41_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test42: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1239 +; CHECK-NEXT: vldi $vr1, -983 +; CHECK-NEXT: ret entry: ret { float, double } { float 12.5000000000, double 12.5000000000 } } define dso_local { float, double } @test43() { -; LA32-LABEL: test43: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI42_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI42_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test43: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI42_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI42_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; 
LA64-NEXT: ret +; CHECK-LABEL: test43: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1238 +; CHECK-NEXT: vldi $vr1, -982 +; CHECK-NEXT: ret entry: ret { float, double } { float 13.0000000000, double 13.0000000000 } } define dso_local { float, double } @test44() { -; LA32-LABEL: test44: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI43_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI43_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test44: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI43_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI43_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test44: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1237 +; CHECK-NEXT: vldi $vr1, -981 +; CHECK-NEXT: ret entry: ret { float, double } { float 13.5000000000, double 13.5000000000 } } define dso_local { float, double } @test45() { -; LA32-LABEL: test45: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI44_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI44_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test45: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI44_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI44_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test45: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1236 +; CHECK-NEXT: vldi $vr1, -980 +; 
CHECK-NEXT: ret entry: ret { float, double } { float 14.0000000000, double 14.0000000000 } } define dso_local { float, double } @test46() { -; LA32-LABEL: test46: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI45_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI45_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test46: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI45_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI45_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test46: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1235 +; CHECK-NEXT: vldi $vr1, -979 +; CHECK-NEXT: ret entry: ret { float, double } { float 14.5000000000, double 14.5000000000 } } define dso_local { float, double } @test47() { -; LA32-LABEL: test47: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI46_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI46_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test47: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI46_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI46_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test47: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1234 +; CHECK-NEXT: vldi $vr1, -978 +; CHECK-NEXT: ret entry: ret { float, double } { float 15.0000000000, double 15.0000000000 } } define dso_local { float, double } @test48() { 
-; LA32-LABEL: test48: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI47_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI47_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test48: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI47_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI47_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test48: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1233 +; CHECK-NEXT: vldi $vr1, -977 +; CHECK-NEXT: ret entry: ret { float, double } { float 15.5000000000, double 15.5000000000 } } define dso_local { float, double } @test49() { -; LA32-LABEL: test49: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI48_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI48_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test49: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI48_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI48_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test49: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1232 +; CHECK-NEXT: vldi $vr1, -976 +; CHECK-NEXT: ret entry: ret { float, double } { float 16.0000000000, double 16.0000000000 } } define dso_local { float, double } @test50() { -; LA32-LABEL: test50: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_0) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI49_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI49_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test50: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI49_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI49_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test50: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1231 +; CHECK-NEXT: vldi $vr1, -975 +; CHECK-NEXT: ret entry: ret { float, double } { float 17.0000000000, double 17.0000000000 } } define dso_local { float, double } @test51() { -; LA32-LABEL: test51: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI50_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI50_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test51: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI50_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI50_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test51: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1230 +; CHECK-NEXT: vldi $vr1, -974 +; CHECK-NEXT: ret entry: ret { float, double } { float 18.0000000000, double 18.0000000000 } } define dso_local { float, double } @test52() { -; LA32-LABEL: test52: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI51_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI51_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test52: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI51_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI51_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test52: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1229 +; CHECK-NEXT: vldi $vr1, -973 +; CHECK-NEXT: ret entry: ret { float, double } { float 19.0000000000, double 19.0000000000 } } define dso_local { float, double } @test53() { -; LA32-LABEL: test53: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI52_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI52_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test53: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI52_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI52_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test53: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1228 +; CHECK-NEXT: vldi $vr1, -972 +; CHECK-NEXT: ret entry: ret { float, double } { float 20.0000000000, double 20.0000000000 } } define dso_local { float, double } @test54() { -; LA32-LABEL: test54: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI53_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI53_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test54: -; LA64: # %bb.0: # %entry -; LA64-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI53_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI53_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test54: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1227 +; CHECK-NEXT: vldi $vr1, -971 +; CHECK-NEXT: ret entry: ret { float, double } { float 21.0000000000, double 21.0000000000 } } define dso_local { float, double } @test55() { -; LA32-LABEL: test55: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI54_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI54_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test55: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI54_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI54_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test55: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1226 +; CHECK-NEXT: vldi $vr1, -970 +; CHECK-NEXT: ret entry: ret { float, double } { float 22.0000000000, double 22.0000000000 } } define dso_local { float, double } @test56() { -; LA32-LABEL: test56: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI55_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI55_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test56: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI55_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI55_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test56: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1225 +; CHECK-NEXT: vldi $vr1, -969 +; CHECK-NEXT: ret entry: ret { float, double } { float 23.0000000000, double 23.0000000000 } } define dso_local { float, double } @test57() { -; LA32-LABEL: test57: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI56_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI56_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test57: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI56_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI56_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test57: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1224 +; CHECK-NEXT: vldi $vr1, -968 +; CHECK-NEXT: ret entry: ret { float, double } { float 24.0000000000, double 24.0000000000 } } define dso_local { float, double } @test58() { -; LA32-LABEL: test58: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI57_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI57_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test58: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI57_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI57_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; 
CHECK-LABEL: test58: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1223 +; CHECK-NEXT: vldi $vr1, -967 +; CHECK-NEXT: ret entry: ret { float, double } { float 25.0000000000, double 25.0000000000 } } define dso_local { float, double } @test59() { -; LA32-LABEL: test59: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI58_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI58_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test59: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI58_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI58_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test59: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1222 +; CHECK-NEXT: vldi $vr1, -966 +; CHECK-NEXT: ret entry: ret { float, double } { float 26.0000000000, double 26.0000000000 } } define dso_local { float, double } @test60() { -; LA32-LABEL: test60: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI59_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI59_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test60: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI59_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI59_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test60: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1221 +; CHECK-NEXT: vldi $vr1, -965 +; CHECK-NEXT: ret entry: ret 
{ float, double } { float 27.0000000000, double 27.0000000000 } } define dso_local { float, double } @test61() { -; LA32-LABEL: test61: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI60_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI60_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test61: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI60_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI60_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test61: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1220 +; CHECK-NEXT: vldi $vr1, -964 +; CHECK-NEXT: ret entry: ret { float, double } { float 28.0000000000, double 28.0000000000 } } define dso_local { float, double } @test62() { -; LA32-LABEL: test62: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI61_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI61_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test62: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI61_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI61_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test62: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1219 +; CHECK-NEXT: vldi $vr1, -963 +; CHECK-NEXT: ret entry: ret { float, double } { float 29.0000000000, double 29.0000000000 } } define dso_local { float, double } @test63() { -; LA32-LABEL: test63: -; 
LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI62_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI62_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test63: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI62_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI62_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test63: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1218 +; CHECK-NEXT: vldi $vr1, -962 +; CHECK-NEXT: ret entry: ret { float, double } { float 30.0000000000, double 30.0000000000 } } define dso_local { float, double } @test64() { -; LA32-LABEL: test64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI63_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI63_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI63_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI63_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1217 +; CHECK-NEXT: vldi $vr1, -961 +; CHECK-NEXT: ret entry: ret { float, double } { float 31.0000000000, double 31.0000000000 } } define dso_local { float, double } @test65() { -; LA32-LABEL: test65: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI64_0) -; LA32-NEXT: 
fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI64_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test65: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI64_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI64_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test65: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1216 +; CHECK-NEXT: vldi $vr1, -960 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1250000000, double 0.1250000000 } } define dso_local { float, double } @test66() { -; LA32-LABEL: test66: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI65_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI65_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test66: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI65_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI65_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test66: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1215 +; CHECK-NEXT: vldi $vr1, -959 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1328125000, double 0.1328125000 } } define dso_local { float, double } @test67() { -; LA32-LABEL: test67: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI66_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI66_1) -; LA32-NEXT: fld.d $fa1, 
$a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test67: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI66_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI66_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test67: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1214 +; CHECK-NEXT: vldi $vr1, -958 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1406250000, double 0.1406250000 } } define dso_local { float, double } @test68() { -; LA32-LABEL: test68: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI67_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI67_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test68: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI67_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI67_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test68: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1213 +; CHECK-NEXT: vldi $vr1, -957 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1484375000, double 0.1484375000 } } define dso_local { float, double } @test69() { -; LA32-LABEL: test69: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI68_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI68_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test69: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA64-NEXT: 
addi.d $a0, $a0, %pc_lo12(.LCPI68_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI68_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test69: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1212 +; CHECK-NEXT: vldi $vr1, -956 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1562500000, double 0.1562500000 } } define dso_local { float, double } @test70() { -; LA32-LABEL: test70: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI69_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI69_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test70: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI69_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI69_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test70: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1211 +; CHECK-NEXT: vldi $vr1, -955 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1640625000, double 0.1640625000 } } define dso_local { float, double } @test71() { -; LA32-LABEL: test71: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI70_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI70_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test71: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI70_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA64-NEXT: addi.d $a0, 
$a0, %pc_lo12(.LCPI70_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test71: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1210 +; CHECK-NEXT: vldi $vr1, -954 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1718750000, double 0.1718750000 } } define dso_local { float, double } @test72() { -; LA32-LABEL: test72: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI71_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI71_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test72: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI71_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI71_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test72: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1209 +; CHECK-NEXT: vldi $vr1, -953 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1796875000, double 0.1796875000 } } define dso_local { float, double } @test73() { -; LA32-LABEL: test73: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI72_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI72_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test73: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI72_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI72_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test73: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vldi $vr0, -1208 +; CHECK-NEXT: vldi $vr1, -952 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1875000000, double 0.1875000000 } } define dso_local { float, double } @test74() { -; LA32-LABEL: test74: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI73_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI73_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test74: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI73_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI73_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test74: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1207 +; CHECK-NEXT: vldi $vr1, -951 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1953125000, double 0.1953125000 } } define dso_local { float, double } @test75() { -; LA32-LABEL: test75: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI74_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI74_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test75: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI74_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI74_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test75: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1206 +; CHECK-NEXT: vldi $vr1, -950 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2031250000, double 0.2031250000 } } 
define dso_local { float, double } @test76() { -; LA32-LABEL: test76: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI75_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI75_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test76: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI75_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI75_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test76: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1205 +; CHECK-NEXT: vldi $vr1, -949 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2109375000, double 0.2109375000 } } define dso_local { float, double } @test77() { -; LA32-LABEL: test77: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI76_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI76_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test77: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI76_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI76_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test77: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1204 +; CHECK-NEXT: vldi $vr1, -948 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2187500000, double 0.2187500000 } } define dso_local { float, double } @test78() { -; LA32-LABEL: test78: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) 
-; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI77_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI77_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test78: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI77_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI77_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test78: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1203 +; CHECK-NEXT: vldi $vr1, -947 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2265625000, double 0.2265625000 } } define dso_local { float, double } @test79() { -; LA32-LABEL: test79: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI78_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI78_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test79: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI78_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI78_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test79: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1202 +; CHECK-NEXT: vldi $vr1, -946 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2343750000, double 0.2343750000 } } define dso_local { float, double } @test80() { -; LA32-LABEL: test80: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI79_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI79_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test80: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI79_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI79_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test80: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1201 +; CHECK-NEXT: vldi $vr1, -945 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2421875000, double 0.2421875000 } } define dso_local { float, double } @test81() { -; LA32-LABEL: test81: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI80_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI80_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test81: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI80_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI80_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test81: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1200 +; CHECK-NEXT: vldi $vr1, -944 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2500000000, double 0.2500000000 } } define dso_local { float, double } @test82() { -; LA32-LABEL: test82: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI81_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI81_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test82: -; LA64: # %bb.0: # 
%entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI81_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI81_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test82: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1199 +; CHECK-NEXT: vldi $vr1, -943 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2656250000, double 0.2656250000 } } define dso_local { float, double } @test83() { -; LA32-LABEL: test83: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI82_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI82_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test83: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI82_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI82_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test83: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1198 +; CHECK-NEXT: vldi $vr1, -942 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2812500000, double 0.2812500000 } } define dso_local { float, double } @test84() { -; LA32-LABEL: test84: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI83_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI83_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test84: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI83_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI83_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test84: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1197 +; CHECK-NEXT: vldi $vr1, -941 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2968750000, double 0.2968750000 } } define dso_local { float, double } @test85() { -; LA32-LABEL: test85: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI84_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI84_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test85: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI84_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI84_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test85: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1196 +; CHECK-NEXT: vldi $vr1, -940 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3125000000, double 0.3125000000 } } define dso_local { float, double } @test86() { -; LA32-LABEL: test86: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI85_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI85_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test86: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI85_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI85_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: 
ret +; CHECK-LABEL: test86: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1195 +; CHECK-NEXT: vldi $vr1, -939 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3281250000, double 0.3281250000 } } define dso_local { float, double } @test87() { -; LA32-LABEL: test87: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI86_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI86_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test87: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI86_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI86_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test87: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1194 +; CHECK-NEXT: vldi $vr1, -938 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3437500000, double 0.3437500000 } } define dso_local { float, double } @test88() { -; LA32-LABEL: test88: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI87_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI87_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test88: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI87_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI87_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test88: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1193 +; CHECK-NEXT: vldi $vr1, -937 +; CHECK-NEXT: ret entry: 
ret { float, double } { float 0.3593750000, double 0.3593750000 } } define dso_local { float, double } @test89() { -; LA32-LABEL: test89: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI88_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI88_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test89: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI88_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI88_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test89: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1192 +; CHECK-NEXT: vldi $vr1, -936 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3750000000, double 0.3750000000 } } define dso_local { float, double } @test90() { -; LA32-LABEL: test90: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI89_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI89_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test90: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI89_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI89_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test90: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1191 +; CHECK-NEXT: vldi $vr1, -935 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3906250000, double 0.3906250000 } } define dso_local { float, double } @test91() { -; LA32-LABEL: test91: -; 
LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI90_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI90_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test91: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI90_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI90_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test91: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1190 +; CHECK-NEXT: vldi $vr1, -934 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4062500000, double 0.4062500000 } } define dso_local { float, double } @test92() { -; LA32-LABEL: test92: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI91_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI91_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test92: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI91_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI91_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test92: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1189 +; CHECK-NEXT: vldi $vr1, -933 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4218750000, double 0.4218750000 } } define dso_local { float, double } @test93() { -; LA32-LABEL: test93: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI92_0) -; LA32-NEXT: fld.s 
$fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI92_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test93: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI92_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI92_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test93: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1188 +; CHECK-NEXT: vldi $vr1, -932 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4375000000, double 0.4375000000 } } define dso_local { float, double } @test94() { -; LA32-LABEL: test94: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI93_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI93_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test94: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI93_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI93_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test94: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1187 +; CHECK-NEXT: vldi $vr1, -931 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4531250000, double 0.4531250000 } } define dso_local { float, double } @test95() { -; LA32-LABEL: test95: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI94_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI94_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 
-; LA32-NEXT: ret -; -; LA64-LABEL: test95: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI94_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI94_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test95: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1186 +; CHECK-NEXT: vldi $vr1, -930 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4687500000, double 0.4687500000 } } define dso_local { float, double } @test96() { -; LA32-LABEL: test96: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI95_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI95_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test96: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI95_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI95_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test96: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1185 +; CHECK-NEXT: vldi $vr1, -929 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4843750000, double 0.4843750000 } } define dso_local { float, double } @test97() { -; LA32-LABEL: test97: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI96_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI96_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test97: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA64-NEXT: addi.d $a0, 
$a0, %pc_lo12(.LCPI96_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI96_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test97: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1184 +; CHECK-NEXT: vldi $vr1, -928 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5000000000, double 0.5000000000 } } define dso_local { float, double } @test98() { -; LA32-LABEL: test98: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI97_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI97_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test98: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI97_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI97_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test98: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1183 +; CHECK-NEXT: vldi $vr1, -927 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5312500000, double 0.5312500000 } } define dso_local { float, double } @test99() { -; LA32-LABEL: test99: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI98_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI98_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test99: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI98_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI98_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test99: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1182 +; CHECK-NEXT: vldi $vr1, -926 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5625000000, double 0.5625000000 } } define dso_local { float, double } @test100() { -; LA32-LABEL: test100: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI99_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI99_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test100: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI99_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI99_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test100: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1181 +; CHECK-NEXT: vldi $vr1, -925 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5937500000, double 0.5937500000 } } define dso_local { float, double } @test101() { -; LA32-LABEL: test101: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI100_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI100_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test101: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI100_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI100_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test101: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vldi $vr0, -1180 +; CHECK-NEXT: vldi $vr1, -924 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.6250000000, double 0.6250000000 } } define dso_local { float, double } @test102() { -; LA32-LABEL: test102: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI101_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI101_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test102: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI101_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI101_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test102: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1179 +; CHECK-NEXT: vldi $vr1, -923 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.6562500000, double 0.6562500000 } } define dso_local { float, double } @test103() { -; LA32-LABEL: test103: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI102_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI102_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test103: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI102_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI102_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test103: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1178 +; CHECK-NEXT: vldi $vr1, -922 +; CHECK-NEXT: ret entry: ret { float, double } { float 
0.6875000000, double 0.6875000000 } } define dso_local { float, double } @test104() { -; LA32-LABEL: test104: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI103_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI103_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test104: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI103_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI103_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test104: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1177 +; CHECK-NEXT: vldi $vr1, -921 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7187500000, double 0.7187500000 } } define dso_local { float, double } @test105() { -; LA32-LABEL: test105: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI104_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI104_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test105: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI104_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI104_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test105: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1176 +; CHECK-NEXT: vldi $vr1, -920 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7500000000, double 0.7500000000 } } define dso_local { float, double } @test106() { -; LA32-LABEL: test106: -; LA32: # 
%bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI105_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI105_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test106: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI105_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI105_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test106: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1175 +; CHECK-NEXT: vldi $vr1, -919 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7812500000, double 0.7812500000 } } define dso_local { float, double } @test107() { -; LA32-LABEL: test107: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI106_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI106_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test107: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI106_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI106_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test107: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1174 +; CHECK-NEXT: vldi $vr1, -918 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8125000000, double 0.8125000000 } } define dso_local { float, double } @test108() { -; LA32-LABEL: test108: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI107_0) -; 
LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI107_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test108: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI107_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI107_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test108: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1173 +; CHECK-NEXT: vldi $vr1, -917 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8437500000, double 0.8437500000 } } define dso_local { float, double } @test109() { -; LA32-LABEL: test109: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI108_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI108_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test109: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI108_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI108_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test109: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1172 +; CHECK-NEXT: vldi $vr1, -916 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8750000000, double 0.8750000000 } } define dso_local { float, double } @test110() { -; LA32-LABEL: test110: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI109_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI109_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test110: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI109_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI109_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test110: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1171 +; CHECK-NEXT: vldi $vr1, -915 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9062500000, double 0.9062500000 } } define dso_local { float, double } @test111() { -; LA32-LABEL: test111: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI110_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI110_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test111: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI110_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI110_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test111: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1170 +; CHECK-NEXT: vldi $vr1, -914 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9375000000, double 0.9375000000 } } define dso_local { float, double } @test112() { -; LA32-LABEL: test112: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI111_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI111_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test112: -; LA64: # %bb.0: # 
%entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI111_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI111_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test112: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1169 +; CHECK-NEXT: vldi $vr1, -913 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9687500000, double 0.9687500000 } } define dso_local { float, double } @test113() { -; LA32-LABEL: test113: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $a0, $zero, 1 -; LA32-NEXT: movgr2fr.w $fa0, $a0 -; LA32-NEXT: ffint.s.w $fa0, $fa0 -; LA32-NEXT: fcvt.d.s $fa1, $fa0 -; LA32-NEXT: ret -; -; LA64-LABEL: test113: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.w $a0, $zero, 1 -; LA64-NEXT: movgr2fr.w $fa0, $a0 -; LA64-NEXT: ffint.s.w $fa0, $fa0 -; LA64-NEXT: addi.d $a0, $zero, 1 -; LA64-NEXT: movgr2fr.d $fa1, $a0 -; LA64-NEXT: ffint.d.l $fa1, $fa1 -; LA64-NEXT: ret +; CHECK-LABEL: test113: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1168 +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.0000000000, double 1.0000000000 } } define dso_local { float, double } @test114() { -; LA32-LABEL: test114: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI113_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI113_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test114: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI113_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI113_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: 
test114: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1167 +; CHECK-NEXT: vldi $vr1, -911 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.0625000000, double 1.0625000000 } } define dso_local { float, double } @test115() { -; LA32-LABEL: test115: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI114_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI114_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test115: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI114_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI114_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test115: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1166 +; CHECK-NEXT: vldi $vr1, -910 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.1250000000, double 1.1250000000 } } define dso_local { float, double } @test116() { -; LA32-LABEL: test116: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI115_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI115_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test116: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI115_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI115_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test116: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1165 +; CHECK-NEXT: vldi $vr1, -909 +; CHECK-NEXT: ret 
entry: ret { float, double } { float 1.1875000000, double 1.1875000000 } } define dso_local { float, double } @test117() { -; LA32-LABEL: test117: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI116_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI116_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test117: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI116_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI116_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test117: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1164 +; CHECK-NEXT: vldi $vr1, -908 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.2500000000, double 1.2500000000 } } define dso_local { float, double } @test118() { -; LA32-LABEL: test118: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI117_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI117_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test118: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI117_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI117_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test118: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1163 +; CHECK-NEXT: vldi $vr1, -907 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.3125000000, double 1.3125000000 } } define dso_local { float, double } @test119() 
{ -; LA32-LABEL: test119: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI118_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI118_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test119: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI118_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI118_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test119: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1162 +; CHECK-NEXT: vldi $vr1, -906 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.3750000000, double 1.3750000000 } } define dso_local { float, double } @test120() { -; LA32-LABEL: test120: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI119_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI119_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test120: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI119_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI119_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test120: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1161 +; CHECK-NEXT: vldi $vr1, -905 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.4375000000, double 1.4375000000 } } define dso_local { float, double } @test121() { -; LA32-LABEL: test121: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI120_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI120_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test121: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI120_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI120_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test121: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1160 +; CHECK-NEXT: vldi $vr1, -904 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.5000000000, double 1.5000000000 } } define dso_local { float, double } @test122() { -; LA32-LABEL: test122: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI121_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI121_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test122: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI121_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI121_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test122: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1159 +; CHECK-NEXT: vldi $vr1, -903 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.5625000000, double 1.5625000000 } } define dso_local { float, double } @test123() { -; LA32-LABEL: test123: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI122_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) 
-; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI122_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test123: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI122_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI122_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test123: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1158 +; CHECK-NEXT: vldi $vr1, -902 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.6250000000, double 1.6250000000 } } define dso_local { float, double } @test124() { -; LA32-LABEL: test124: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI123_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI123_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test124: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI123_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI123_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test124: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1157 +; CHECK-NEXT: vldi $vr1, -901 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.6875000000, double 1.6875000000 } } define dso_local { float, double } @test125() { -; LA32-LABEL: test125: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI124_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI124_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: 
test125: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI124_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI124_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test125: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1156 +; CHECK-NEXT: vldi $vr1, -900 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.7500000000, double 1.7500000000 } } define dso_local { float, double } @test126() { -; LA32-LABEL: test126: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI125_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI125_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test126: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI125_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI125_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test126: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1155 +; CHECK-NEXT: vldi $vr1, -899 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.8125000000, double 1.8125000000 } } define dso_local { float, double } @test127() { -; LA32-LABEL: test127: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI126_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI126_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test127: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI126_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI126_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1154 +; CHECK-NEXT: vldi $vr1, -898 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.8750000000, double 1.8750000000 } } define dso_local { float, double } @test128() { -; LA32-LABEL: test128: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI127_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI127_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test128: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI127_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI127_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1153 +; CHECK-NEXT: vldi $vr1, -897 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.9375000000, double 1.9375000000 } } define dso_local { float, double } @test129() { -; LA32-LABEL: test129: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI128_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI128_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test129: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI128_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA64-NEXT: 
addi.d $a0, $a0, %pc_lo12(.LCPI128_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test129: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1152 +; CHECK-NEXT: vldi $vr1, -896 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.0000000000, double -2.0000000000 } } define dso_local { float, double } @test130() { -; LA32-LABEL: test130: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI129_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI129_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test130: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI129_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI129_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test130: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1151 +; CHECK-NEXT: vldi $vr1, -895 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.1250000000, double -2.1250000000 } } define dso_local { float, double } @test131() { -; LA32-LABEL: test131: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI130_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI130_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test131: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI130_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI130_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test131: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1150 +; CHECK-NEXT: vldi $vr1, -894 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.2500000000, double -2.2500000000 } } define dso_local { float, double } @test132() { -; LA32-LABEL: test132: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI131_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI131_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test132: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI131_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI131_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test132: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1149 +; CHECK-NEXT: vldi $vr1, -893 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.3750000000, double -2.3750000000 } } define dso_local { float, double } @test133() { -; LA32-LABEL: test133: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI132_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI132_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test133: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI132_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI132_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test133: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1148 +; CHECK-NEXT: vldi $vr1, -892 +; CHECK-NEXT: ret entry: ret 
{ float, double } { float -2.5000000000, double -2.5000000000 } } define dso_local { float, double } @test134() { -; LA32-LABEL: test134: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI133_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI133_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test134: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI133_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI133_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test134: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1147 +; CHECK-NEXT: vldi $vr1, -891 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.6250000000, double -2.6250000000 } } define dso_local { float, double } @test135() { -; LA32-LABEL: test135: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI134_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI134_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test135: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI134_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI134_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test135: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1146 +; CHECK-NEXT: vldi $vr1, -890 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.7500000000, double -2.7500000000 } } define dso_local { float, double } @test136() { -; 
LA32-LABEL: test136: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI135_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI135_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test136: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI135_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI135_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test136: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1145 +; CHECK-NEXT: vldi $vr1, -889 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.8750000000, double -2.8750000000 } } define dso_local { float, double } @test137() { -; LA32-LABEL: test137: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI136_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI136_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test137: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI136_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI136_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test137: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1144 +; CHECK-NEXT: vldi $vr1, -888 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.0000000000, double -3.0000000000 } } define dso_local { float, double } @test138() { -; LA32-LABEL: test138: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI137_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI137_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test138: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI137_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI137_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test138: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1143 +; CHECK-NEXT: vldi $vr1, -887 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.1250000000, double -3.1250000000 } } define dso_local { float, double } @test139() { -; LA32-LABEL: test139: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI138_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI138_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test139: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI138_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI138_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test139: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1142 +; CHECK-NEXT: vldi $vr1, -886 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.2500000000, double -3.2500000000 } } define dso_local { float, double } @test140() { -; LA32-LABEL: test140: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI139_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI139_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI139_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test140: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI139_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI139_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test140: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1141 +; CHECK-NEXT: vldi $vr1, -885 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.3750000000, double -3.3750000000 } } define dso_local { float, double } @test141() { -; LA32-LABEL: test141: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI140_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI140_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test141: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI140_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI140_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test141: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1140 +; CHECK-NEXT: vldi $vr1, -884 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.5000000000, double -3.5000000000 } } define dso_local { float, double } @test142() { -; LA32-LABEL: test142: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI141_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI141_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; 
LA32-NEXT: ret -; -; LA64-LABEL: test142: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI141_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI141_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test142: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1139 +; CHECK-NEXT: vldi $vr1, -883 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.6250000000, double -3.6250000000 } } define dso_local { float, double } @test143() { -; LA32-LABEL: test143: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI142_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI142_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test143: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI142_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI142_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test143: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1138 +; CHECK-NEXT: vldi $vr1, -882 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.7500000000, double -3.7500000000 } } define dso_local { float, double } @test144() { -; LA32-LABEL: test144: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI143_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI143_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test144: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI143_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI143_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI143_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test144: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1137 +; CHECK-NEXT: vldi $vr1, -881 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.8750000000, double -3.8750000000 } } define dso_local { float, double } @test145() { -; LA32-LABEL: test145: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI144_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI144_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test145: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI144_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI144_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test145: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1136 +; CHECK-NEXT: vldi $vr1, -880 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.0000000000, double -4.0000000000 } } define dso_local { float, double } @test146() { -; LA32-LABEL: test146: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI145_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI145_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test146: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI145_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI145_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test146: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1135 +; CHECK-NEXT: vldi $vr1, -879 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.2500000000, double -4.2500000000 } } define dso_local { float, double } @test147() { -; LA32-LABEL: test147: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI146_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI146_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test147: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI146_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI146_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test147: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1134 +; CHECK-NEXT: vldi $vr1, -878 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.5000000000, double -4.5000000000 } } define dso_local { float, double } @test148() { -; LA32-LABEL: test148: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI147_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI147_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test148: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI147_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI147_1) -; LA64-NEXT: 
fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test148: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1133 +; CHECK-NEXT: vldi $vr1, -877 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.7500000000, double -4.7500000000 } } define dso_local { float, double } @test149() { -; LA32-LABEL: test149: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI148_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI148_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test149: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI148_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI148_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test149: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1132 +; CHECK-NEXT: vldi $vr1, -876 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.0000000000, double -5.0000000000 } } define dso_local { float, double } @test150() { -; LA32-LABEL: test150: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI149_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI149_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test150: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI149_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI149_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test150: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, 
-1131 +; CHECK-NEXT: vldi $vr1, -875 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.2500000000, double -5.2500000000 } } define dso_local { float, double } @test151() { -; LA32-LABEL: test151: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI150_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI150_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test151: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI150_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI150_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test151: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1130 +; CHECK-NEXT: vldi $vr1, -874 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.5000000000, double -5.5000000000 } } define dso_local { float, double } @test152() { -; LA32-LABEL: test152: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI151_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI151_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test152: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI151_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI151_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test152: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1129 +; CHECK-NEXT: vldi $vr1, -873 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.7500000000, double 
-5.7500000000 } } define dso_local { float, double } @test153() { -; LA32-LABEL: test153: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI152_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI152_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test153: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI152_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI152_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test153: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1128 +; CHECK-NEXT: vldi $vr1, -872 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.0000000000, double -6.0000000000 } } define dso_local { float, double } @test154() { -; LA32-LABEL: test154: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI153_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI153_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test154: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI153_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI153_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test154: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1127 +; CHECK-NEXT: vldi $vr1, -871 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.2500000000, double -6.2500000000 } } define dso_local { float, double } @test155() { -; LA32-LABEL: test155: -; LA32: # %bb.0: # %entry 
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI154_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI154_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test155: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI154_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI154_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test155: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1126 +; CHECK-NEXT: vldi $vr1, -870 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.5000000000, double -6.5000000000 } } define dso_local { float, double } @test156() { -; LA32-LABEL: test156: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI155_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI155_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test156: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI155_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI155_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test156: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1125 +; CHECK-NEXT: vldi $vr1, -869 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.7500000000, double -6.7500000000 } } define dso_local { float, double } @test157() { -; LA32-LABEL: test157: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI156_0) -; LA32-NEXT: 
fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI156_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test157: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI156_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI156_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test157: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1124 +; CHECK-NEXT: vldi $vr1, -868 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.0000000000, double -7.0000000000 } } define dso_local { float, double } @test158() { -; LA32-LABEL: test158: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI157_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI157_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test158: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI157_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI157_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test158: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1123 +; CHECK-NEXT: vldi $vr1, -867 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.2500000000, double -7.2500000000 } } define dso_local { float, double } @test159() { -; LA32-LABEL: test159: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI158_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI158_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test159: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI158_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI158_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test159: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1122 +; CHECK-NEXT: vldi $vr1, -866 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.5000000000, double -7.5000000000 } } define dso_local { float, double } @test160() { -; LA32-LABEL: test160: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI159_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI159_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test160: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI159_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI159_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test160: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1121 +; CHECK-NEXT: vldi $vr1, -865 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.7500000000, double -7.7500000000 } } define dso_local { float, double } @test161() { -; LA32-LABEL: test161: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI160_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI160_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test161: -; LA64: # %bb.0: 
# %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI160_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI160_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test161: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1120 +; CHECK-NEXT: vldi $vr1, -864 +; CHECK-NEXT: ret entry: ret { float, double } { float -8.0000000000, double -8.0000000000 } } define dso_local { float, double } @test162() { -; LA32-LABEL: test162: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI161_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI161_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test162: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI161_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI161_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test162: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1119 +; CHECK-NEXT: vldi $vr1, -863 +; CHECK-NEXT: ret entry: ret { float, double } { float -8.5000000000, double -8.5000000000 } } define dso_local { float, double } @test163() { -; LA32-LABEL: test163: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI162_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI162_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test163: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI162_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI162_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test163: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1118 +; CHECK-NEXT: vldi $vr1, -862 +; CHECK-NEXT: ret entry: ret { float, double } { float -9.0000000000, double -9.0000000000 } } define dso_local { float, double } @test164() { -; LA32-LABEL: test164: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI163_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI163_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test164: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI163_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI163_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test164: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1117 +; CHECK-NEXT: vldi $vr1, -861 +; CHECK-NEXT: ret entry: ret { float, double } { float -9.5000000000, double -9.5000000000 } } define dso_local { float, double } @test165() { -; LA32-LABEL: test165: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI164_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI164_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test165: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI164_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI164_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test165: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1116 +; CHECK-NEXT: vldi $vr1, -860 +; CHECK-NEXT: ret entry: ret { float, double } { float -10.0000000000, double -10.0000000000 } } define dso_local { float, double } @test166() { -; LA32-LABEL: test166: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI165_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI165_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test166: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI165_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI165_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test166: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1115 +; CHECK-NEXT: vldi $vr1, -859 +; CHECK-NEXT: ret entry: ret { float, double } { float -10.5000000000, double -10.5000000000 } } define dso_local { float, double } @test167() { -; LA32-LABEL: test167: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI166_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI166_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test167: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI166_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI166_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test167: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1114 +; CHECK-NEXT: vldi $vr1, -858 +; CHECK-NEXT: ret entry: ret { float, double } { float -11.0000000000, double -11.0000000000 } } define dso_local { float, double } @test168() { -; LA32-LABEL: test168: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI167_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI167_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test168: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI167_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI167_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test168: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1113 +; CHECK-NEXT: vldi $vr1, -857 +; CHECK-NEXT: ret entry: ret { float, double } { float -11.5000000000, double -11.5000000000 } } define dso_local { float, double } @test169() { -; LA32-LABEL: test169: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI168_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI168_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test169: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI168_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI168_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test169: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1112 +; CHECK-NEXT: vldi $vr1, -856 +; CHECK-NEXT: ret entry: ret { 
float, double } { float -12.0000000000, double -12.0000000000 } } define dso_local { float, double } @test170() { -; LA32-LABEL: test170: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI169_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI169_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test170: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI169_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI169_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test170: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1111 +; CHECK-NEXT: vldi $vr1, -855 +; CHECK-NEXT: ret entry: ret { float, double } { float -12.5000000000, double -12.5000000000 } } define dso_local { float, double } @test171() { -; LA32-LABEL: test171: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI170_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI170_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test171: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI170_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI170_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test171: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1110 +; CHECK-NEXT: vldi $vr1, -854 +; CHECK-NEXT: ret entry: ret { float, double } { float -13.0000000000, double -13.0000000000 } } define dso_local { float, double } @test172() { 
-; LA32-LABEL: test172: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI171_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI171_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test172: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI171_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI171_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test172: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1109 +; CHECK-NEXT: vldi $vr1, -853 +; CHECK-NEXT: ret entry: ret { float, double } { float -13.5000000000, double -13.5000000000 } } define dso_local { float, double } @test173() { -; LA32-LABEL: test173: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI172_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI172_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test173: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI172_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI172_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test173: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1108 +; CHECK-NEXT: vldi $vr1, -852 +; CHECK-NEXT: ret entry: ret { float, double } { float -14.0000000000, double -14.0000000000 } } define dso_local { float, double } @test174() { -; LA32-LABEL: test174: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI173_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI173_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test174: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI173_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI173_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test174: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1107 +; CHECK-NEXT: vldi $vr1, -851 +; CHECK-NEXT: ret entry: ret { float, double } { float -14.5000000000, double -14.5000000000 } } define dso_local { float, double } @test175() { -; LA32-LABEL: test175: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI174_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI174_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test175: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI174_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI174_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test175: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1106 +; CHECK-NEXT: vldi $vr1, -850 +; CHECK-NEXT: ret entry: ret { float, double } { float -15.0000000000, double -15.0000000000 } } define dso_local { float, double } @test176() { -; LA32-LABEL: test176: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI175_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI175_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI175_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test176: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI175_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI175_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test176: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1105 +; CHECK-NEXT: vldi $vr1, -849 +; CHECK-NEXT: ret entry: ret { float, double } { float -15.5000000000, double -15.5000000000 } } define dso_local { float, double } @test177() { -; LA32-LABEL: test177: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI176_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI176_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test177: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI176_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI176_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test177: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1104 +; CHECK-NEXT: vldi $vr1, -848 +; CHECK-NEXT: ret entry: ret { float, double } { float -16.0000000000, double -16.0000000000 } } define dso_local { float, double } @test178() { -; LA32-LABEL: test178: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI177_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI177_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; 
LA32-NEXT: ret -; -; LA64-LABEL: test178: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI177_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI177_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test178: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1103 +; CHECK-NEXT: vldi $vr1, -847 +; CHECK-NEXT: ret entry: ret { float, double } { float -17.0000000000, double -17.0000000000 } } define dso_local { float, double } @test179() { -; LA32-LABEL: test179: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI178_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI178_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test179: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI178_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI178_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test179: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1102 +; CHECK-NEXT: vldi $vr1, -846 +; CHECK-NEXT: ret entry: ret { float, double } { float -18.0000000000, double -18.0000000000 } } define dso_local { float, double } @test180() { -; LA32-LABEL: test180: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI179_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI179_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test180: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI179_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI179_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI179_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test180: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1101 +; CHECK-NEXT: vldi $vr1, -845 +; CHECK-NEXT: ret entry: ret { float, double } { float -19.0000000000, double -19.0000000000 } } define dso_local { float, double } @test181() { -; LA32-LABEL: test181: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI180_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI180_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test181: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI180_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI180_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test181: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1100 +; CHECK-NEXT: vldi $vr1, -844 +; CHECK-NEXT: ret entry: ret { float, double } { float -20.0000000000, double -20.0000000000 } } define dso_local { float, double } @test182() { -; LA32-LABEL: test182: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI181_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI181_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test182: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI181_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI181_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test182: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1099 +; CHECK-NEXT: vldi $vr1, -843 +; CHECK-NEXT: ret entry: ret { float, double } { float -21.0000000000, double -21.0000000000 } } define dso_local { float, double } @test183() { -; LA32-LABEL: test183: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI182_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI182_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test183: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI182_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI182_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test183: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1098 +; CHECK-NEXT: vldi $vr1, -842 +; CHECK-NEXT: ret entry: ret { float, double } { float -22.0000000000, double -22.0000000000 } } define dso_local { float, double } @test184() { -; LA32-LABEL: test184: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI183_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI183_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test184: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI183_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI183_1) -; LA64-NEXT: 
fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test184: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1097 +; CHECK-NEXT: vldi $vr1, -841 +; CHECK-NEXT: ret entry: ret { float, double } { float -23.0000000000, double -23.0000000000 } } define dso_local { float, double } @test185() { -; LA32-LABEL: test185: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI184_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI184_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test185: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI184_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI184_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test185: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1096 +; CHECK-NEXT: vldi $vr1, -840 +; CHECK-NEXT: ret entry: ret { float, double } { float -24.0000000000, double -24.0000000000 } } define dso_local { float, double } @test186() { -; LA32-LABEL: test186: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI185_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI185_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test186: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI185_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI185_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test186: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi 
$vr0, -1095 +; CHECK-NEXT: vldi $vr1, -839 +; CHECK-NEXT: ret entry: ret { float, double } { float -25.0000000000, double -25.0000000000 } } define dso_local { float, double } @test187() { -; LA32-LABEL: test187: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI186_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI186_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test187: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI186_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI186_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test187: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1094 +; CHECK-NEXT: vldi $vr1, -838 +; CHECK-NEXT: ret entry: ret { float, double } { float -26.0000000000, double -26.0000000000 } } define dso_local { float, double } @test188() { -; LA32-LABEL: test188: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI187_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI187_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test188: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI187_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI187_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test188: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1093 +; CHECK-NEXT: vldi $vr1, -837 +; CHECK-NEXT: ret entry: ret { float, double } { float 
-27.0000000000, double -27.0000000000 } } define dso_local { float, double } @test189() { -; LA32-LABEL: test189: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI188_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI188_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test189: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI188_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI188_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test189: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1092 +; CHECK-NEXT: vldi $vr1, -836 +; CHECK-NEXT: ret entry: ret { float, double } { float -28.0000000000, double -28.0000000000 } } define dso_local { float, double } @test190() { -; LA32-LABEL: test190: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI189_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI189_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test190: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI189_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI189_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test190: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1091 +; CHECK-NEXT: vldi $vr1, -835 +; CHECK-NEXT: ret entry: ret { float, double } { float -29.0000000000, double -29.0000000000 } } define dso_local { float, double } @test191() { -; LA32-LABEL: test191: 
-; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI190_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI190_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test191: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI190_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI190_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test191: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1090 +; CHECK-NEXT: vldi $vr1, -834 +; CHECK-NEXT: ret entry: ret { float, double } { float -30.0000000000, double -30.0000000000 } } define dso_local { float, double } @test192() { -; LA32-LABEL: test192: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI191_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI191_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test192: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI191_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI191_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test192: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1089 +; CHECK-NEXT: vldi $vr1, -833 +; CHECK-NEXT: ret entry: ret { float, double } { float -31.0000000000, double -31.0000000000 } } define dso_local { float, double } @test193() { -; LA32-LABEL: test193: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI192_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI192_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test193: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI192_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI192_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test193: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1088 +; CHECK-NEXT: vldi $vr1, -832 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1250000000, double -0.1250000000 } } define dso_local { float, double } @test194() { -; LA32-LABEL: test194: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI193_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI193_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test194: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI193_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI193_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test194: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1087 +; CHECK-NEXT: vldi $vr1, -831 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1328125000, double -0.1328125000 } } define dso_local { float, double } @test195() { -; LA32-LABEL: test195: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI194_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI194_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test195: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI194_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI194_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test195: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1086 +; CHECK-NEXT: vldi $vr1, -830 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1406250000, double -0.1406250000 } } define dso_local { float, double } @test196() { -; LA32-LABEL: test196: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI195_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI195_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test196: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI195_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI195_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test196: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1085 +; CHECK-NEXT: vldi $vr1, -829 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1484375000, double -0.1484375000 } } define dso_local { float, double } @test197() { -; LA32-LABEL: test197: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI196_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI196_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test197: 
-; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI196_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI196_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test197: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1084 +; CHECK-NEXT: vldi $vr1, -828 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1562500000, double -0.1562500000 } } define dso_local { float, double } @test198() { -; LA32-LABEL: test198: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI197_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI197_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test198: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI197_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI197_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test198: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1083 +; CHECK-NEXT: vldi $vr1, -827 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1640625000, double -0.1640625000 } } define dso_local { float, double } @test199() { -; LA32-LABEL: test199: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI198_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI198_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test199: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_0) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI198_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI198_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test199: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1082 +; CHECK-NEXT: vldi $vr1, -826 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1718750000, double -0.1718750000 } } define dso_local { float, double } @test200() { -; LA32-LABEL: test200: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI199_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI199_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test200: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI199_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI199_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test200: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1081 +; CHECK-NEXT: vldi $vr1, -825 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1796875000, double -0.1796875000 } } define dso_local { float, double } @test201() { -; LA32-LABEL: test201: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI200_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI200_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test201: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI200_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA64-NEXT: 
addi.d $a0, $a0, %pc_lo12(.LCPI200_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test201: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1080 +; CHECK-NEXT: vldi $vr1, -824 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1875000000, double -0.1875000000 } } define dso_local { float, double } @test202() { -; LA32-LABEL: test202: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI201_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI201_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test202: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI201_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI201_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test202: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1079 +; CHECK-NEXT: vldi $vr1, -823 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1953125000, double -0.1953125000 } } define dso_local { float, double } @test203() { -; LA32-LABEL: test203: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI202_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI202_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test203: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI202_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI202_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test203: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1078 +; CHECK-NEXT: vldi $vr1, -822 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2031250000, double -0.2031250000 } } define dso_local { float, double } @test204() { -; LA32-LABEL: test204: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI203_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI203_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test204: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI203_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI203_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test204: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1077 +; CHECK-NEXT: vldi $vr1, -821 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2109375000, double -0.2109375000 } } define dso_local { float, double } @test205() { -; LA32-LABEL: test205: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI204_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI204_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test205: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI204_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI204_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test205: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1076 +; CHECK-NEXT: vldi $vr1, -820 +; CHECK-NEXT: ret entry: ret 
{ float, double } { float -0.2187500000, double -0.2187500000 } } define dso_local { float, double } @test206() { -; LA32-LABEL: test206: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI205_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI205_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test206: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI205_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI205_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test206: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1075 +; CHECK-NEXT: vldi $vr1, -819 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2265625000, double -0.2265625000 } } define dso_local { float, double } @test207() { -; LA32-LABEL: test207: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI206_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI206_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test207: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI206_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI206_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test207: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1074 +; CHECK-NEXT: vldi $vr1, -818 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2343750000, double -0.2343750000 } } define dso_local { float, double } @test208() { -; 
LA32-LABEL: test208: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI207_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI207_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test208: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI207_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI207_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test208: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1073 +; CHECK-NEXT: vldi $vr1, -817 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2421875000, double -0.2421875000 } } define dso_local { float, double } @test209() { -; LA32-LABEL: test209: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI208_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI208_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test209: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI208_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI208_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test209: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1072 +; CHECK-NEXT: vldi $vr1, -816 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2500000000, double -0.2500000000 } } define dso_local { float, double } @test210() { -; LA32-LABEL: test210: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI209_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI209_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test210: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI209_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI209_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test210: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1071 +; CHECK-NEXT: vldi $vr1, -815 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2656250000, double -0.2656250000 } } define dso_local { float, double } @test211() { -; LA32-LABEL: test211: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI210_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI210_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test211: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI210_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI210_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test211: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1070 +; CHECK-NEXT: vldi $vr1, -814 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2812500000, double -0.2812500000 } } define dso_local { float, double } @test212() { -; LA32-LABEL: test212: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI211_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI211_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI211_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test212: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI211_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI211_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test212: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1069 +; CHECK-NEXT: vldi $vr1, -813 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2968750000, double -0.2968750000 } } define dso_local { float, double } @test213() { -; LA32-LABEL: test213: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI212_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI212_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test213: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI212_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI212_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test213: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1068 +; CHECK-NEXT: vldi $vr1, -812 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3125000000, double -0.3125000000 } } define dso_local { float, double } @test214() { -; LA32-LABEL: test214: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI213_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI213_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; 
LA32-NEXT: ret -; -; LA64-LABEL: test214: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI213_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI213_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test214: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1067 +; CHECK-NEXT: vldi $vr1, -811 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3281250000, double -0.3281250000 } } define dso_local { float, double } @test215() { -; LA32-LABEL: test215: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI214_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI214_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test215: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI214_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI214_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test215: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1066 +; CHECK-NEXT: vldi $vr1, -810 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3437500000, double -0.3437500000 } } define dso_local { float, double } @test216() { -; LA32-LABEL: test216: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI215_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI215_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test216: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI215_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI215_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI215_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test216: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1065 +; CHECK-NEXT: vldi $vr1, -809 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3593750000, double -0.3593750000 } } define dso_local { float, double } @test217() { -; LA32-LABEL: test217: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI216_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI216_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test217: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI216_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI216_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test217: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1064 +; CHECK-NEXT: vldi $vr1, -808 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3750000000, double -0.3750000000 } } define dso_local { float, double } @test218() { -; LA32-LABEL: test218: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI217_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI217_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test218: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI217_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI217_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test218: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1063 +; CHECK-NEXT: vldi $vr1, -807 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3906250000, double -0.3906250000 } } define dso_local { float, double } @test219() { -; LA32-LABEL: test219: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI218_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI218_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test219: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI218_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI218_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test219: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1062 +; CHECK-NEXT: vldi $vr1, -806 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4062500000, double -0.4062500000 } } define dso_local { float, double } @test220() { -; LA32-LABEL: test220: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI219_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI219_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test220: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI219_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI219_1) -; LA64-NEXT: 
fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test220: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1061 +; CHECK-NEXT: vldi $vr1, -805 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4218750000, double -0.4218750000 } } define dso_local { float, double } @test221() { -; LA32-LABEL: test221: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI220_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI220_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test221: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI220_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI220_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test221: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1060 +; CHECK-NEXT: vldi $vr1, -804 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4375000000, double -0.4375000000 } } define dso_local { float, double } @test222() { -; LA32-LABEL: test222: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI221_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI221_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test222: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI221_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI221_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test222: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, 
-1059 +; CHECK-NEXT: vldi $vr1, -803 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4531250000, double -0.4531250000 } } define dso_local { float, double } @test223() { -; LA32-LABEL: test223: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI222_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI222_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test223: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI222_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI222_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test223: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1058 +; CHECK-NEXT: vldi $vr1, -802 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4687500000, double -0.4687500000 } } define dso_local { float, double } @test224() { -; LA32-LABEL: test224: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI223_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI223_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test224: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI223_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI223_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test224: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1057 +; CHECK-NEXT: vldi $vr1, -801 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4843750000, double 
-0.4843750000 } } define dso_local { float, double } @test225() { -; LA32-LABEL: test225: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI224_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI224_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test225: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI224_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI224_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test225: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1056 +; CHECK-NEXT: vldi $vr1, -800 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5000000000, double -0.5000000000 } } define dso_local { float, double } @test226() { -; LA32-LABEL: test226: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI225_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI225_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test226: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI225_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI225_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test226: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1055 +; CHECK-NEXT: vldi $vr1, -799 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5312500000, double -0.5312500000 } } define dso_local { float, double } @test227() { -; LA32-LABEL: test227: -; LA32: # %bb.0: # %entry 
-; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI226_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI226_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test227: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI226_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI226_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test227: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1054 +; CHECK-NEXT: vldi $vr1, -798 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5625000000, double -0.5625000000 } } define dso_local { float, double } @test228() { -; LA32-LABEL: test228: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI227_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI227_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test228: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI227_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI227_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test228: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1053 +; CHECK-NEXT: vldi $vr1, -797 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5937500000, double -0.5937500000 } } define dso_local { float, double } @test229() { -; LA32-LABEL: test229: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI228_0) -; LA32-NEXT: 
fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI228_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test229: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI228_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI228_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test229: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1052 +; CHECK-NEXT: vldi $vr1, -796 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6250000000, double -0.6250000000 } } define dso_local { float, double } @test230() { -; LA32-LABEL: test230: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI229_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI229_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test230: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI229_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI229_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test230: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1051 +; CHECK-NEXT: vldi $vr1, -795 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6562500000, double -0.6562500000 } } define dso_local { float, double } @test231() { -; LA32-LABEL: test231: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI230_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI230_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test231: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI230_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI230_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test231: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1050 +; CHECK-NEXT: vldi $vr1, -794 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6875000000, double -0.6875000000 } } define dso_local { float, double } @test232() { -; LA32-LABEL: test232: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI231_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI231_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test232: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI231_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI231_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test232: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1049 +; CHECK-NEXT: vldi $vr1, -793 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7187500000, double -0.7187500000 } } define dso_local { float, double } @test233() { -; LA32-LABEL: test233: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI232_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI232_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test233: -; LA64: # %bb.0: 
# %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI232_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI232_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test233: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1048 +; CHECK-NEXT: vldi $vr1, -792 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7500000000, double -0.7500000000 } } define dso_local { float, double } @test234() { -; LA32-LABEL: test234: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI233_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI233_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test234: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI233_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI233_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test234: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1047 +; CHECK-NEXT: vldi $vr1, -791 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7812500000, double -0.7812500000 } } define dso_local { float, double } @test235() { -; LA32-LABEL: test235: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI234_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI234_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test235: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI234_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI234_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test235: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1046 +; CHECK-NEXT: vldi $vr1, -790 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8125000000, double -0.8125000000 } } define dso_local { float, double } @test236() { -; LA32-LABEL: test236: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI235_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI235_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test236: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI235_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI235_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test236: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1045 +; CHECK-NEXT: vldi $vr1, -789 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8437500000, double -0.8437500000 } } define dso_local { float, double } @test237() { -; LA32-LABEL: test237: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI236_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI236_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test237: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI236_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI236_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test237: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1044 +; CHECK-NEXT: vldi $vr1, -788 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8750000000, double -0.8750000000 } } define dso_local { float, double } @test238() { -; LA32-LABEL: test238: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI237_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI237_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test238: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI237_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI237_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test238: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1043 +; CHECK-NEXT: vldi $vr1, -787 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9062500000, double -0.9062500000 } } define dso_local { float, double } @test239() { -; LA32-LABEL: test239: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI238_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI238_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test239: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI238_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI238_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test239: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: vldi $vr0, -1042 +; CHECK-NEXT: vldi $vr1, -786 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9375000000, double -0.9375000000 } } define dso_local { float, double } @test240() { -; LA32-LABEL: test240: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI239_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI239_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test240: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI239_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI239_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test240: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1041 +; CHECK-NEXT: vldi $vr1, -785 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9687500000, double -0.9687500000 } } define dso_local { float, double } @test241() { -; LA32-LABEL: test241: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI240_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI240_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test241: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI240_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI240_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test241: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1040 +; CHECK-NEXT: vldi $vr1, -784 +; CHECK-NEXT: ret entry: ret { float, double } 
{ float -1.0000000000, double -1.0000000000 } } define dso_local { float, double } @test242() { -; LA32-LABEL: test242: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI241_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI241_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test242: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI241_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI241_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test242: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1039 +; CHECK-NEXT: vldi $vr1, -783 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.0625000000, double -1.0625000000 } } define dso_local { float, double } @test243() { -; LA32-LABEL: test243: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI242_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI242_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test243: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI242_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI242_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test243: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1038 +; CHECK-NEXT: vldi $vr1, -782 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.1250000000, double -1.1250000000 } } define dso_local { float, double } @test244() { -; LA32-LABEL: 
test244: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI243_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI243_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test244: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI243_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI243_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test244: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1037 +; CHECK-NEXT: vldi $vr1, -781 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.1875000000, double -1.1875000000 } } define dso_local { float, double } @test245() { -; LA32-LABEL: test245: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI244_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI244_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test245: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI244_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI244_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test245: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1036 +; CHECK-NEXT: vldi $vr1, -780 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.2500000000, double -1.2500000000 } } define dso_local { float, double } @test246() { -; LA32-LABEL: test246: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI245_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI245_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test246: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI245_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI245_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test246: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1035 +; CHECK-NEXT: vldi $vr1, -779 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.3125000000, double -1.3125000000 } } define dso_local { float, double } @test247() { -; LA32-LABEL: test247: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI246_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI246_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test247: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI246_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI246_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test247: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1034 +; CHECK-NEXT: vldi $vr1, -778 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.3750000000, double -1.3750000000 } } define dso_local { float, double } @test248() { -; LA32-LABEL: test248: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI247_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI247_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test248: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI247_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI247_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test248: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1033 +; CHECK-NEXT: vldi $vr1, -777 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.4375000000, double -1.4375000000 } } define dso_local { float, double } @test249() { -; LA32-LABEL: test249: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI248_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI248_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test249: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI248_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI248_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test249: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1032 +; CHECK-NEXT: vldi $vr1, -776 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.5000000000, double -1.5000000000 } } define dso_local { float, double } @test250() { -; LA32-LABEL: test250: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI249_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI249_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test250: 
-; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI249_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI249_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test250: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1031 +; CHECK-NEXT: vldi $vr1, -775 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.5625000000, double -1.5625000000 } } define dso_local { float, double } @test251() { -; LA32-LABEL: test251: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI250_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI250_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test251: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI250_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI250_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test251: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1030 +; CHECK-NEXT: vldi $vr1, -774 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.6250000000, double -1.6250000000 } } define dso_local { float, double } @test252() { -; LA32-LABEL: test252: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI251_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI251_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test252: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI251_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI251_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test252: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1029 +; CHECK-NEXT: vldi $vr1, -773 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.6875000000, double -1.6875000000 } } define dso_local { float, double } @test253() { -; LA32-LABEL: test253: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI252_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI252_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test253: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI252_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI252_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test253: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1028 +; CHECK-NEXT: vldi $vr1, -772 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.7500000000, double -1.7500000000 } } define dso_local { float, double } @test254() { -; LA32-LABEL: test254: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI253_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI253_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test254: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI253_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA64-NEXT: 
addi.d $a0, $a0, %pc_lo12(.LCPI253_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test254: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1027 +; CHECK-NEXT: vldi $vr1, -771 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.8125000000, double -1.8125000000 } } define dso_local { float, double } @test255() { -; LA32-LABEL: test255: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI254_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI254_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test255: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI254_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI254_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1026 +; CHECK-NEXT: vldi $vr1, -770 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.8750000000, double -1.8750000000 } } define dso_local { float, double } @test256() { -; LA32-LABEL: test256: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI255_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI255_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 -; LA32-NEXT: ret -; -; LA64-LABEL: test256: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI255_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI255_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 -; LA64-NEXT: ret +; CHECK-LABEL: test256: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1025 +; CHECK-NEXT: vldi $vr1, -769 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.9375000000, double -1.9375000000 } } diff --git a/llvm/test/CodeGen/LoongArch/float-imm.ll b/llvm/test/CodeGen/LoongArch/float-imm.ll index e2cbf4bf9b3e874..006a9e64b190de7 100644 --- a/llvm/test/CodeGen/LoongArch/float-imm.ll +++ b/llvm/test/CodeGen/LoongArch/float-imm.ll @@ -34,15 +34,13 @@ define float @f32_constant_pi() nounwind { ; LA32-LABEL: f32_constant_pi: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA32-NEXT: ret ; ; LA64-LABEL: f32_constant_pi: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA64-NEXT: ret ret float 3.14159274101257324218750 } diff --git a/llvm/test/CodeGen/LoongArch/ghc-cc.ll b/llvm/test/CodeGen/LoongArch/ghc-cc.ll index 735315d323a3626..f99759b4b5ed563 100644 --- a/llvm/test/CodeGen/LoongArch/ghc-cc.ll +++ b/llvm/test/CodeGen/LoongArch/ghc-cc.ll @@ -27,56 +27,39 @@ define ghccc void @foo() nounwind { ; LA64-LABEL: foo: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d4) -; LA64-NEXT: fld.d $fs7, $a0, 0 +; LA64-NEXT: fld.d $fs7, $a0, %pc_lo12(d4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d3) -; LA64-NEXT: fld.d $fs6, $a0, 0 +; LA64-NEXT: fld.d $fs6, $a0, %pc_lo12(d3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d2) -; LA64-NEXT: fld.d $fs5, $a0, 0 +; LA64-NEXT: fld.d $fs5, $a0, %pc_lo12(d2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d1) -; LA64-NEXT: fld.d $fs4, $a0, 0 +; LA64-NEXT: fld.d $fs4, $a0, %pc_lo12(d1) ; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(f4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f4) -; LA64-NEXT: fld.s $fs3, $a0, 0 +; LA64-NEXT: fld.s $fs3, $a0, %pc_lo12(f4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f3) -; LA64-NEXT: fld.s $fs2, $a0, 0 +; LA64-NEXT: fld.s $fs2, $a0, %pc_lo12(f3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f2) -; LA64-NEXT: fld.s $fs1, $a0, 0 +; LA64-NEXT: fld.s $fs1, $a0, %pc_lo12(f2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f1) -; LA64-NEXT: fld.s $fs0, $a0, 0 +; LA64-NEXT: fld.s $fs0, $a0, %pc_lo12(f1) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(splim) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(splim) -; LA64-NEXT: ld.d $s8, $a0, 0 +; LA64-NEXT: ld.d $s8, $a0, %pc_lo12(splim) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r5) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r5) -; LA64-NEXT: ld.d $s7, $a0, 0 +; LA64-NEXT: ld.d $s7, $a0, %pc_lo12(r5) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r4) -; LA64-NEXT: ld.d $s6, $a0, 0 +; LA64-NEXT: ld.d $s6, $a0, %pc_lo12(r4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r3) -; LA64-NEXT: ld.d $s5, $a0, 0 +; LA64-NEXT: ld.d $s5, $a0, %pc_lo12(r3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r2) -; LA64-NEXT: ld.d $s4, $a0, 0 +; LA64-NEXT: ld.d $s4, $a0, %pc_lo12(r2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r1) -; LA64-NEXT: ld.d $s3, $a0, 0 +; LA64-NEXT: ld.d $s3, $a0, %pc_lo12(r1) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(hp) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(hp) -; LA64-NEXT: ld.d $s2, $a0, 0 +; LA64-NEXT: ld.d $s2, $a0, %pc_lo12(hp) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(sp) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(sp) -; LA64-NEXT: ld.d $s1, $a0, 0 +; LA64-NEXT: ld.d $s1, $a0, %pc_lo12(sp) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(base) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(base) -; LA64-NEXT: ld.d $s0, $a0, 0 +; LA64-NEXT: ld.d $s0, $a0, %pc_lo12(base) ; LA64-NEXT: b %plt(bar) entry: diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll index 2423dd81a4d3a8b..89ea48c3b1cbf68 100644 --- a/llvm/test/CodeGen/LoongArch/global-address.ll +++ b/llvm/test/CodeGen/LoongArch/global-address.ll @@ -16,8 +16,7 @@ define void @foo() nounwind { ; LA32NOPIC-NEXT: ld.w $a0, $a0, %got_pc_lo12(G) ; LA32NOPIC-NEXT: ld.w $zero, $a0, 0 ; LA32NOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) -; LA32NOPIC-NEXT: addi.w $a0, $a0, %pc_lo12(g) -; LA32NOPIC-NEXT: ld.w $zero, $a0, 0 +; LA32NOPIC-NEXT: ld.w $zero, $a0, %pc_lo12(g) ; LA32NOPIC-NEXT: ret ; ; LA32PIC-LABEL: foo: @@ -26,8 +25,7 @@ define void @foo() nounwind { ; LA32PIC-NEXT: ld.w $a0, $a0, %got_pc_lo12(G) ; LA32PIC-NEXT: ld.w $zero, $a0, 0 ; LA32PIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; LA32PIC-NEXT: addi.w $a0, $a0, %pc_lo12(.Lg$local) -; LA32PIC-NEXT: ld.w $zero, $a0, 0 +; LA32PIC-NEXT: ld.w $zero, $a0, %pc_lo12(.Lg$local) ; LA32PIC-NEXT: ret ; ; LA64NOPIC-LABEL: foo: @@ -36,8 +34,7 @@ define void @foo() nounwind { ; LA64NOPIC-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; LA64NOPIC-NEXT: ld.w $zero, $a0, 0 ; LA64NOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) -; LA64NOPIC-NEXT: addi.d $a0, $a0, %pc_lo12(g) -; LA64NOPIC-NEXT: ld.w $zero, $a0, 0 +; LA64NOPIC-NEXT: ld.w $zero, $a0, %pc_lo12(g) ; LA64NOPIC-NEXT: ret ; ; LA64PIC-LABEL: foo: @@ -46,8 +43,7 @@ define void @foo() nounwind { ; LA64PIC-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; LA64PIC-NEXT: ld.w $zero, $a0, 0 ; LA64PIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; LA64PIC-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; LA64PIC-NEXT: ld.w $zero, $a0, 0 +; LA64PIC-NEXT: ld.w $zero, $a0, %pc_lo12(.Lg$local) ; LA64PIC-NEXT: ret ; ; LA64LARGENOPIC-LABEL: foo: @@ -62,8 +58,7 @@ define void @foo() nounwind { ; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g) ; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g) ; 
LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) -; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGENOPIC-NEXT: ld.w $zero, $a0, 0 +; LA64LARGENOPIC-NEXT: ldx.w $zero, $a1, $a0 ; LA64LARGENOPIC-NEXT: ret ; ; LA64LARGEPIC-LABEL: foo: @@ -78,8 +73,7 @@ define void @foo() nounwind { ; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGEPIC-NEXT: ld.w $zero, $a0, 0 +; LA64LARGEPIC-NEXT: ldx.w $zero, $a1, $a0 ; LA64LARGEPIC-NEXT: ret %V = load volatile i32, ptr @G %v = load volatile i32, ptr @g diff --git a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll index 2b7a862ecde11e1..04f6a635778eba5 100644 --- a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll +++ b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll @@ -7,8 +7,7 @@ define dso_local signext i32 @local_small() #0 { ; CHECK-LABEL: local_small: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(a) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(a) -; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ld.w $a0, $a0, %pc_lo12(a) ; CHECK-NEXT: ret %1 = load i32, ptr @a, align 4 ret i32 %1 @@ -23,8 +22,7 @@ define dso_local signext i32 @local_large() #0 { ; CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(b) ; CHECK-NEXT: lu32i.d $a1, %pc64_lo20(b) ; CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(b) -; CHECK-NEXT: add.d $a0, $a1, $a0 -; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ldx.w $a0, $a1, $a0 ; CHECK-NEXT: ret %1 = load i32, ptr @b, align 4 ret i32 %1 diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll index fa675e4bbb32434..be9ea29b54c3320 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll @@ -10,8 +10,7 @@ define 
double @constraint_f_double(double %a) nounwind { ; LA32-LABEL: constraint_f_double: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(gd) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(gd) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(gd) ; LA32-NEXT: #APP ; LA32-NEXT: fadd.d $fa0, $fa0, $fa1 ; LA32-NEXT: #NO_APP @@ -20,8 +19,7 @@ define double @constraint_f_double(double %a) nounwind { ; LA64-LABEL: constraint_f_double: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(gd) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(gd) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(gd) ; LA64-NEXT: #APP ; LA64-NEXT: fadd.d $fa0, $fa0, $fa1 ; LA64-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll index becb3cae46b8c05..565ccdbe6880fbf 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll @@ -147,19 +147,17 @@ define i32 @m_offset_2048(ptr %p) nounwind { define i32 @m_addr_pcrel() nounwind { ; LA32-LABEL: m_addr_pcrel: ; LA32: # %bb.0: -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i32) +; LA32-NEXT: pcalau12i $a1, %pc_hi20(g_i32) ; LA32-NEXT: #APP -; LA32-NEXT: ld.w $a0, $a1, 0 +; LA32-NEXT: ld.w $a0, $a1, %pc_lo12(g_i32) ; LA32-NEXT: #NO_APP ; LA32-NEXT: ret ; ; LA64-LABEL: m_addr_pcrel: ; LA64: # %bb.0: -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a1, $a0, %pc_lo12(g_i32) +; LA64-NEXT: pcalau12i $a1, %pc_hi20(g_i32) ; LA64-NEXT: #APP -; LA64-NEXT: ld.w $a0, $a1, 0 +; LA64-NEXT: ld.w $a0, $a1, %pc_lo12(g_i32) ; LA64-NEXT: #NO_APP ; LA64-NEXT: ret %1 = tail call i32 asm sideeffect "ld.w $0, $1", "=&r,*m"(ptr nonnull elementtype(i32) @g_i32) diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index 193fa6c08600aec..7e320d9245f1c2f 
100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -79,8 +79,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB1_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -113,8 +112,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB1_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -413,8 +411,7 @@ define double @double_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB5_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -665,8 +662,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI9_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB9_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -699,8 +695,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i 
$a1, %pc_hi20(.LCPI9_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI9_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB9_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -999,8 +994,7 @@ define double @double_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI13_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB13_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1251,8 +1245,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI17_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB17_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -1285,8 +1278,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI17_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB17_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1585,8 +1577,7 @@ define double @double_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI21_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB21_1: # %atomicrmw.start ; LA64D-NEXT: # 
=>This Inner Loop Header: Depth=1 @@ -1837,8 +1828,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI25_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB25_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -1871,8 +1861,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI25_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB25_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2171,8 +2160,7 @@ define double @double_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI29_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB29_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2423,8 +2411,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI33_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI33_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB33_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -2457,8 +2444,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) -; LA64D-NEXT: addi.d $a1, $a1, 
%pc_lo12(.LCPI33_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI33_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB33_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2757,8 +2743,7 @@ define double @double_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI37_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB37_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll index ef117f974887157..8d08942c314aa45 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll @@ -117,8 +117,7 @@ define i32 @convert_double_to_u32(double %a) nounwind { ; LA32-LABEL: convert_double_to_u32: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_0) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA32-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 ; LA32-NEXT: fsub.d $fa1, $fa0, $fa1 ; LA32-NEXT: ftintrz.w.d $fa1, $fa1 @@ -174,8 +173,7 @@ define i64 @convert_double_to_u64(double %a) nounwind { ; LA64-LABEL: convert_double_to_u64: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_0) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_0) ; LA64-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 ; LA64-NEXT: fsub.d $fa1, $fa0, $fa1 ; LA64-NEXT: ftintrz.l.d $fa1, $fa1 @@ -234,8 +232,7 @@ define double @convert_u32_to_double(i32 %a) nounwind { ; LA32-NEXT: st.w $a0, $sp, 8 ; LA32-NEXT: fld.d $fa0, $sp, 8 ; LA32-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_0) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_0) ; LA32-NEXT: fsub.d $fa0, $fa0, $fa1 ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -264,8 +261,7 @@ define double @convert_u64_to_double(i64 %a) nounwind { ; LA64: # %bb.0: ; LA64-NEXT: srli.d $a1, $a0, 32 ; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI13_0) -; LA64-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI13_0) -; LA64-NEXT: fld.d $fa0, $a2, 0 +; LA64-NEXT: fld.d $fa0, $a2, %pc_lo12(.LCPI13_0) ; LA64-NEXT: lu52i.d $a2, $zero, 1107 ; LA64-NEXT: or $a1, $a1, $a2 ; LA64-NEXT: movgr2fr.d $fa1, $a1 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll index b01b84ba385ec84..b7de5a592c35983 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll @@ -182,8 +182,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA32F-LABEL: convert_float_to_u32: ; LA32F: # %bb.0: ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32F-NEXT: fld.s $fa1, $a0, 0 +; LA32F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA32F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA32F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA32F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -201,8 +200,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA32D-LABEL: convert_float_to_u32: ; LA32D: # %bb.0: ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32D-NEXT: fld.s $fa1, $a0, 0 +; LA32D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA32D-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA32D-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA32D-NEXT: ftintrz.w.s $fa1, $fa1 @@ -220,8 +218,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA64F-LABEL: convert_float_to_u32: ; LA64F: # %bb.0: ; LA64F-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI6_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_0) -; LA64F-NEXT: fld.s $fa1, $a0, 0 +; LA64F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA64F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -267,8 +264,7 @@ define i64 @convert_float_to_u64(float %a) nounwind { ; LA64F-LABEL: convert_float_to_u64: ; LA64F: # %bb.0: ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64F-NEXT: fld.s $fa1, $a0, 0 +; LA64F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA64F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -286,8 +282,7 @@ define i64 @convert_float_to_u64(float %a) nounwind { ; LA64D-LABEL: convert_float_to_u64: ; LA64D: # %bb.0: ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64D-NEXT: fld.s $fa1, $a0, 0 +; LA64D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA64D-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64D-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64D-NEXT: ftintrz.l.s $fa1, $fa1 @@ -506,8 +501,7 @@ define float @convert_u32_to_float(i32 %a) nounwind { ; LA32D-NEXT: st.w $a0, $sp, 8 ; LA32D-NEXT: fld.d $fa0, $sp, 8 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_0) -; LA32D-NEXT: fld.d $fa1, $a0, 0 +; LA32D-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_0) ; LA32D-NEXT: fsub.d $fa0, $fa0, $fa1 ; LA32D-NEXT: fcvt.s.d $fa0, $fa0 ; LA32D-NEXT: addi.w $sp, $sp, 16 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll index 26f44adc6135865..772ae8d81a88bf0 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll @@ -11,38 +11,34 @@ define i32 @load_store_global() nounwind { ; LA32NOPIC-LABEL: load_store_global: ; LA32NOPIC: # %bb.0: -; 
LA32NOPIC-NEXT: pcalau12i $a0, %pc_hi20(G) -; LA32NOPIC-NEXT: addi.w $a1, $a0, %pc_lo12(G) -; LA32NOPIC-NEXT: ld.w $a0, $a1, 0 +; LA32NOPIC-NEXT: pcalau12i $a1, %pc_hi20(G) +; LA32NOPIC-NEXT: ld.w $a0, $a1, %pc_lo12(G) ; LA32NOPIC-NEXT: addi.w $a0, $a0, 1 -; LA32NOPIC-NEXT: st.w $a0, $a1, 0 +; LA32NOPIC-NEXT: st.w $a0, $a1, %pc_lo12(G) ; LA32NOPIC-NEXT: ret ; ; LA32PIC-LABEL: load_store_global: ; LA32PIC: # %bb.0: -; LA32PIC-NEXT: pcalau12i $a0, %pc_hi20(.LG$local) -; LA32PIC-NEXT: addi.w $a1, $a0, %pc_lo12(.LG$local) -; LA32PIC-NEXT: ld.w $a0, $a1, 0 +; LA32PIC-NEXT: pcalau12i $a1, %pc_hi20(.LG$local) +; LA32PIC-NEXT: ld.w $a0, $a1, %pc_lo12(.LG$local) ; LA32PIC-NEXT: addi.w $a0, $a0, 1 -; LA32PIC-NEXT: st.w $a0, $a1, 0 +; LA32PIC-NEXT: st.w $a0, $a1, %pc_lo12(.LG$local) ; LA32PIC-NEXT: ret ; ; LA64NOPIC-LABEL: load_store_global: ; LA64NOPIC: # %bb.0: -; LA64NOPIC-NEXT: pcalau12i $a0, %pc_hi20(G) -; LA64NOPIC-NEXT: addi.d $a1, $a0, %pc_lo12(G) -; LA64NOPIC-NEXT: ld.w $a0, $a1, 0 +; LA64NOPIC-NEXT: pcalau12i $a1, %pc_hi20(G) +; LA64NOPIC-NEXT: ld.w $a0, $a1, %pc_lo12(G) ; LA64NOPIC-NEXT: addi.w $a0, $a0, 1 -; LA64NOPIC-NEXT: st.w $a0, $a1, 0 +; LA64NOPIC-NEXT: st.w $a0, $a1, %pc_lo12(G) ; LA64NOPIC-NEXT: ret ; ; LA64PIC-LABEL: load_store_global: ; LA64PIC: # %bb.0: -; LA64PIC-NEXT: pcalau12i $a0, %pc_hi20(.LG$local) -; LA64PIC-NEXT: addi.d $a1, $a0, %pc_lo12(.LG$local) -; LA64PIC-NEXT: ld.w $a0, $a1, 0 +; LA64PIC-NEXT: pcalau12i $a1, %pc_hi20(.LG$local) +; LA64PIC-NEXT: ld.w $a0, $a1, %pc_lo12(.LG$local) ; LA64PIC-NEXT: addi.w $a0, $a0, 1 -; LA64PIC-NEXT: st.w $a0, $a1, 0 +; LA64PIC-NEXT: st.w $a0, $a1, %pc_lo12(.LG$local) ; LA64PIC-NEXT: ret %v = load i32, ptr @G %sum = add i32 %v, 1 diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll index 5248468b6027d52..ba72ef5bd7ba4be 100644 --- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll +++ 
b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll @@ -15,11 +15,10 @@ define void @test_la_pcrel(i32 signext %n) { ; LA32: # %bb.0: # %entry ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA32-NEXT: addi.w $a2, $a2, %pc_lo12(l) ; LA32-NEXT: .p2align 4, , 16 ; LA32-NEXT: .LBB0_1: # %loop ; LA32-NEXT: # =>This Inner Loop Header: Depth=1 -; LA32-NEXT: ld.w $zero, $a2, 0 +; LA32-NEXT: ld.w $zero, $a2, %pc_lo12(l) ; LA32-NEXT: addi.w $a1, $a1, 1 ; LA32-NEXT: blt $a1, $a0, .LBB0_1 ; LA32-NEXT: # %bb.2: # %ret @@ -29,11 +28,10 @@ define void @test_la_pcrel(i32 signext %n) { ; LA64: # %bb.0: # %entry ; LA64-NEXT: move $a1, $zero ; LA64-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA64-NEXT: addi.d $a2, $a2, %pc_lo12(l) ; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB0_1: # %loop ; LA64-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ld.w $zero, $a2, 0 +; LA64-NEXT: ld.w $zero, $a2, %pc_lo12(l) ; LA64-NEXT: addi.w $a1, $a1, 1 ; LA64-NEXT: blt $a1, $a0, .LBB0_1 ; LA64-NEXT: # %bb.2: # %ret @@ -41,18 +39,17 @@ define void @test_la_pcrel(i32 signext %n) { ; ; LA64LARGE-LABEL: test_la_pcrel: ; LA64LARGE: # %bb.0: # %entry -; LA64LARGE-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA64LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(l) -; LA64LARGE-NEXT: lu32i.d $a1, %pc64_lo20(l) -; LA64LARGE-NEXT: lu52i.d $a3, $a1, %pc64_hi12(l) -; LA64LARGE-NEXT: move $a1, $zero -; LA64LARGE-NEXT: add.d $a2, $a3, $a2 +; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(l) +; LA64LARGE-NEXT: addi.d $a2, $zero, %pc_lo12(l) +; LA64LARGE-NEXT: lu32i.d $a2, %pc64_lo20(l) +; LA64LARGE-NEXT: lu52i.d $a2, $a2, %pc64_hi12(l) +; LA64LARGE-NEXT: move $a3, $zero ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB0_1: # %loop ; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64LARGE-NEXT: ld.w $zero, $a2, 0 -; LA64LARGE-NEXT: addi.w $a1, $a1, 1 -; LA64LARGE-NEXT: blt $a1, $a0, .LBB0_1 +; LA64LARGE-NEXT: ldx.w $zero, $a2, $a1 +; LA64LARGE-NEXT: addi.w $a3, $a3, 1 +; LA64LARGE-NEXT: blt 
$a3, $a0, .LBB0_1 ; LA64LARGE-NEXT: # %bb.2: # %ret ; LA64LARGE-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll index 48d18dbedcaf265..32a4c4bdd1508a9 100644 --- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll +++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll @@ -12,15 +12,13 @@ define dso_local signext i8 @load_s8() nounwind { ; LA32-LABEL: load_s8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) -; LA32-NEXT: ld.b $a0, $a0, 0 +; LA32-NEXT: ld.b $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) -; LA64-NEXT: ld.b $a0, $a0, 0 +; LA64-NEXT: ld.b $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s8: @@ -29,8 +27,7 @@ define dso_local signext i8 @load_s8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.b $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.b $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i8, ptr @g_i8 @@ -41,15 +38,13 @@ define dso_local zeroext i8 @load_u8() nounwind { ; LA32-LABEL: load_u8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) -; LA32-NEXT: ld.bu $a0, $a0, 0 +; LA32-NEXT: ld.bu $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) -; LA64-NEXT: ld.bu $a0, $a0, 0 +; LA64-NEXT: ld.bu $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_u8: @@ -58,8 +53,7 @@ define dso_local zeroext i8 @load_u8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; 
LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.bu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.bu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i8, ptr @g_i8 @@ -70,17 +64,15 @@ define dso_local void @store_i8() nounwind { ; LA32-LABEL: store_i8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.b $a1, $a0, 0 +; LA32-NEXT: st.b $a1, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.b $a1, $a0, 0 +; LA64-NEXT: st.b $a1, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i8: @@ -89,9 +81,8 @@ define dso_local void @store_i8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.b $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.b $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i8 1, ptr @g_i8 @@ -104,15 +95,13 @@ define dso_local signext i16 @load_s16() nounwind { ; LA32-LABEL: load_s16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) -; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: ld.h $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) -; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: ld.h $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s16: @@ -121,8 +110,7 @@ define dso_local signext i16 @load_s16() 
nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.h $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.h $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i16, ptr @g_i16 @@ -133,15 +121,13 @@ define dso_local zeroext i16 @load_u16() nounwind { ; LA32-LABEL: load_u16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) -; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: ld.hu $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) -; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: ld.hu $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_u16: @@ -150,8 +136,7 @@ define dso_local zeroext i16 @load_u16() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.hu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.hu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i16, ptr @g_i16 @@ -162,17 +147,15 @@ define dso_local void @store_i16() nounwind { ; LA32-LABEL: store_i16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.h $a1, $a0, 0 +; LA32-NEXT: st.h $a1, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.h $a1, $a0, 0 +; LA64-NEXT: st.h $a1, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i16: @@ -181,9 +164,8 @@ define 
dso_local void @store_i16() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.h $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.h $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i16 1, ptr @g_i16 @@ -196,15 +178,13 @@ define dso_local signext i32 @load_s32() nounwind { ; LA32-LABEL: load_s32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) -; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s32: @@ -213,8 +193,7 @@ define dso_local signext i32 @load_s32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.w $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.w $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i32, ptr @g_i32 @@ -225,15 +204,13 @@ define dso_local zeroext i32 @load_u32() nounwind { ; LA32-LABEL: load_u32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) -; LA64-NEXT: ld.wu $a0, $a0, 0 +; LA64-NEXT: ld.wu $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: 
load_u32: @@ -242,8 +219,7 @@ define dso_local zeroext i32 @load_u32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.wu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.wu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i32, ptr @g_i32 @@ -254,17 +230,15 @@ define dso_local void @store_i32() nounwind { ; LA32-LABEL: store_i32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i32: @@ -273,9 +247,8 @@ define dso_local void @store_i32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.w $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i32 1, ptr @g_i32 @@ -296,8 +269,7 @@ define dso_local i64 @load_64() nounwind { ; LA64-LABEL: load_64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64) -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_64: @@ -306,8 +278,7 @@ define dso_local i64 @load_64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64) ; LA64-LARGE-NEXT: lu32i.d $a1, 
%pc64_lo20(g_i64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i64, ptr @g_i64 @@ -327,9 +298,8 @@ define dso_local void @store_i64() nounwind { ; LA64-LABEL: store_i64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i64: @@ -338,9 +308,8 @@ define dso_local void @store_i64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i64 1, ptr @g_i64 @@ -353,15 +322,13 @@ define dso_local float @load_f32() nounwind { ; LA32-LABEL: load_f32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(g_f32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_f32: @@ -370,8 +337,7 @@ define dso_local float @load_f32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.s $fa0, 
$a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load float, ptr @g_f32 @@ -382,17 +348,15 @@ define dso_local void @store_f32() nounwind { ; LA32-LABEL: store_f32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32) ; LA32-NEXT: lu12i.w $a1, 260096 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32) ; LA64-NEXT: lu12i.w $a1, 260096 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_f32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_f32: @@ -401,9 +365,8 @@ define dso_local void @store_f32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 260096 -; LA64-LARGE-NEXT: st.w $a1, $a0, 0 +; LA64-LARGE-NEXT: lu12i.w $a2, 260096 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store float 1.0, ptr @g_f32 @@ -416,15 +379,13 @@ define dso_local double @load_f64() nounwind { ; LA32-LABEL: load_f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64) -; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: load_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64) -; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(g_f64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_f64: @@ -433,8 +394,7 @@ define dso_local double @load_f64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64) -; 
LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.d $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load double, ptr @g_f64 @@ -445,20 +405,18 @@ define dso_local void @store_f64() nounwind { ; LA32-LABEL: store_f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: addi.w $a1, $zero, 1 ; LA32-NEXT: movgr2fr.w $fa0, $a1 ; LA32-NEXT: ffint.s.w $fa0, $fa0 ; LA32-NEXT: fcvt.d.s $fa0, $fa0 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fst.d $fa0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: store_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64) ; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_f64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_f64: @@ -467,9 +425,8 @@ define dso_local void @store_f64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: lu52i.d $a2, $zero, 1023 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store double 1.0, ptr @g_f64 @@ -494,11 +451,10 @@ define dso_local void @store_multi() nounwind { ; LA64-LABEL: store_multi: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_m64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ori $a1, $zero, 2 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_multi: @@ -507,11 +463,10 @@ define dso_local void @store_multi() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, 
$zero, %pc_lo12(g_m64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_m64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_m64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 -; LA64-LARGE-NEXT: ori $a1, $zero, 2 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 +; LA64-LARGE-NEXT: ori $a2, $zero, 2 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store volatile i64 1, ptr @g_m64 @@ -525,17 +480,15 @@ define dso_local void @store_sf32() nounwind { ; LA32-LABEL: store_sf32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf32) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: fst.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(g_sf32) +; LA32-NEXT: fst.s $fa0, $a0, %pc_lo12(g_sf32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_sf32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf32) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: fst.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(g_sf32) +; LA64-NEXT: fst.s $fa0, $a0, %pc_lo12(g_sf32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_sf32: @@ -544,9 +497,8 @@ define dso_local void @store_sf32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0 -; LA64-LARGE-NEXT: fst.s $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.s $fa0, $a1, $a0 +; LA64-LARGE-NEXT: fstx.s $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load float, ptr @g_sf32 @@ -560,17 +512,15 @@ define dso_local void @store_sf64() nounwind { ; LA32-LABEL: store_sf64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf64) -; 
LA32-NEXT: fld.d $fa0, $a0, 0 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(g_sf64) +; LA32-NEXT: fst.d $fa0, $a0, %pc_lo12(g_sf64) ; LA32-NEXT: ret ; ; LA64-LABEL: store_sf64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf64) -; LA64-NEXT: fld.d $fa0, $a0, 0 -; LA64-NEXT: fst.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(g_sf64) +; LA64-NEXT: fst.d $fa0, $a0, %pc_lo12(g_sf64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_sf64: @@ -579,9 +529,8 @@ define dso_local void @store_sf64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0 -; LA64-LARGE-NEXT: fst.d $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.d $fa0, $a1, $a0 +; LA64-LARGE-NEXT: fstx.d $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load double, ptr @g_sf64 @@ -608,10 +557,9 @@ define dso_local void @rmw() nounwind { ; LA64-LABEL: rmw: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_rmw) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_rmw) -; LA64-NEXT: ld.d $a1, $a0, 0 +; LA64-NEXT: ld.d $a1, $a0, %pc_lo12(g_rmw) ; LA64-NEXT: addi.d $a1, $a1, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_rmw) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: rmw: @@ -620,10 +568,9 @@ define dso_local void @rmw() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_rmw) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_rmw) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_rmw) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a1, $a0, 0 -; LA64-LARGE-NEXT: addi.d $a1, $a1, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a2, $a1, $a0 +; LA64-LARGE-NEXT: addi.d $a2, $a2, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i64, ptr @g_rmw 
@@ -637,31 +584,26 @@ entry: define dso_local void @store_a32() nounwind { ; LA32-LABEL: store_a32: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32) -; LA32-NEXT: lu12i.w $a1, 1 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4096) ; LA32-NEXT: ret ; ; LA64-LABEL: store_a32: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32) +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: stptr.w $a1, $a0, 4096 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4096) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_a32: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: stptr.w $a1, $a0, 4096 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4096) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4096) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4096) +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4 @@ -714,48 +656,44 @@ entry: define dso_local void @control_flow_with_mem_access() nounwind { ; LA32-LABEL: control_flow_with_mem_access: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32) -; LA32-NEXT: ld.w $a1, $a0, 4 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA32-NEXT: ld.w $a1, $a0, %pc_lo12(g_a32+4) ; 
LA32-NEXT: ori $a2, $zero, 1 ; LA32-NEXT: blt $a1, $a2, .LBB21_2 ; LA32-NEXT: # %bb.1: # %if.then ; LA32-NEXT: ori $a1, $zero, 10 -; LA32-NEXT: st.w $a1, $a0, 4 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4) ; LA32-NEXT: .LBB21_2: # %if.end ; LA32-NEXT: ret ; ; LA64-LABEL: control_flow_with_mem_access: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32) -; LA64-NEXT: ld.w $a1, $a0, 4 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-NEXT: ld.w $a1, $a0, %pc_lo12(g_a32+4) ; LA64-NEXT: ori $a2, $zero, 1 ; LA64-NEXT: blt $a1, $a2, .LBB21_2 ; LA64-NEXT: # %bb.1: # %if.then ; LA64-NEXT: ori $a1, $zero, 10 -; LA64-NEXT: st.w $a1, $a0, 4 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4) ; LA64-NEXT: .LBB21_2: # %if.end ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: control_flow_with_mem_access: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.w $a0, $a0, 4 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4) +; LA64-LARGE-NEXT: ldx.w $a0, $a1, $a0 ; LA64-LARGE-NEXT: ori $a1, $zero, 1 ; LA64-LARGE-NEXT: blt $a0, $a1, .LBB21_2 ; LA64-LARGE-NEXT: # %bb.1: # %if.then -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 10 -; LA64-LARGE-NEXT: st.w $a1, $a0, 4 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4) 
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4) +; LA64-LARGE-NEXT: ori $a2, $zero, 10 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: .LBB21_2: # %if.end ; LA64-LARGE-NEXT: ret entry: @@ -777,8 +715,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA32-NEXT: .Ltmp0: # Block address taken ; LA32-NEXT: # %bb.1: # %label ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp0) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(.Ltmp0) ; LA32-NEXT: ret ; ; LA64-LABEL: load_ba_1: @@ -786,8 +723,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA64-NEXT: .Ltmp0: # Block address taken ; LA64-NEXT: # %bb.1: # %label ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp0) -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(.Ltmp0) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_ba_1: @@ -798,8 +734,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp0) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp0) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp0) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: br label %label @@ -813,30 +748,27 @@ define dso_local ptr @load_ba_2() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: .Ltmp1: # Block address taken ; LA32-NEXT: # %bb.1: # %label -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp1) -; LA32-NEXT: ld.w $a0, $a0, 8 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(.Ltmp1+8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_ba_2: ; LA64: # %bb.0: # %entry ; LA64-NEXT: .Ltmp1: # Block address taken ; LA64-NEXT: # %bb.1: # %label -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp1) -; 
LA64-NEXT: ld.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(.Ltmp1+8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_ba_2: ; LA64-LARGE: # %bb.0: # %entry ; LA64-LARGE-NEXT: .Ltmp1: # Block address taken ; LA64-LARGE-NEXT: # %bb.1: # %label -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp1) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp1) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 8 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp1+8) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp1+8) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1+8) +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: br label %label @@ -850,26 +782,23 @@ label: define dso_local ptr @load_addr_offset_1() nounwind { ; LA32-LABEL: load_addr_offset_1: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 8 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8) +; LA64-LARGE-NEXT: 
lu32i.d $a1, %pc64_lo20(g_a64+8) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 8 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1) @@ -878,29 +807,23 @@ entry: define dso_local ptr @load_addr_offset_257() nounwind { ; LA32-LABEL: load_addr_offset_257: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 9 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2056) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_257: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 2047 -; LA64-NEXT: addi.d $a0, $a0, 9 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2056) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_257: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2056) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2056) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2056) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 2047 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 9 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257) @@ -909,27 +832,23 @@ entry: define dso_local ptr @load_addr_offset_1048576() nounwind { ; LA32-LABEL: load_addr_offset_1048576: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8388608) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1048576: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8388608) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1048576: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8388608) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+8388608) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8388608) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576) @@ -938,30 +857,23 @@ entry: define dso_local ptr @load_addr_offset_1048577() nounwind { ; LA32-LABEL: load_addr_offset_1048577: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388616) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8388616) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1048577: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(g_a64+8388616) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8388616) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1048577: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388616) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8388616) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+8388616) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8388616) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 8 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577) @@ -970,29 +882,23 @@ entry: define dso_local ptr @load_addr_offset_268432896() nounwind { ; LA32-LABEL: load_addr_offset_268432896: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463168) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2147463168) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_268432896: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463168) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2147463168) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_268432896: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, 
%pc_hi20(g_a64+2147463168) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2147463168) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2147463168) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2147463168) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 524283 -; LA64-LARGE-NEXT: add.d $a0, $a0, $a1 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896) @@ -1001,32 +907,23 @@ entry: define dso_local ptr @load_addr_offset_268432897() nounwind { ; LA32-LABEL: load_addr_offset_268432897: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2147463176) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_268432897: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: ori $a1, $a1, 8 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2147463176) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_268432897: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2147463176) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2147463176) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2147463176) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 524283 -; LA64-LARGE-NEXT: ori $a1, $a1, 8 -; LA64-LARGE-NEXT: 
add.d $a0, $a0, $a1 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897) @@ -1035,11 +932,8 @@ entry: define dso_local ptr @load_addr_offset_9380351707272() nounwind { ; LA32-LABEL: load_addr_offset_9380351707272: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 279556 -; LA32-NEXT: ori $a1, $a1, 1088 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+1145062464) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+1145062464) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_9380351707272: @@ -1071,11 +965,8 @@ entry: define dso_local ptr @load_addr_offset_614750729487779976() nounwind { ; LA32-LABEL: load_addr_offset_614750729487779976: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 279556 -; LA32-NEXT: ori $a1, $a1, 1088 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+1145062464) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+1145062464) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_614750729487779976: diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 391888a38daf696..e07a334888c189f 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -105,6 +105,7 @@ ; LAXX-NEXT: Remove dead machine instructions ; LA64-NEXT: LoongArch Optimize W Instructions ; LAXX-NEXT: LoongArch Pre-RA pseudo instruction expansion pass +; LAXX-NEXT: LoongArch Merge Base Offset ; LAXX-NEXT: Detect Dead Lanes ; LAXX-NEXT: Init Undef Pass ; LAXX-NEXT: Process Implicit Definitions diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index b03a523fb79a9bf..a7873f466bee3f0 100644 --- 
a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -25,8 +25,7 @@ define void @foo() nounwind { ; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, 0 ; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; MEDIUM_NO_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, %pc_lo12(.Lg$local) ; MEDIUM_NO_SCH-NEXT: ori $a0, $zero, 1 ; MEDIUM_NO_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_NO_SCH-NEXT: jirl $ra, $ra, 0 @@ -55,8 +54,7 @@ define void @foo() nounwind { ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; MEDIUM_SCH-NEXT: ld.d $zero, $a0, 0 ; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; MEDIUM_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUM_SCH-NEXT: ld.d $zero, $a0, %pc_lo12(.Lg$local) ; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 @@ -91,8 +89,7 @@ define void @foo() nounwind { ; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 -; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGE_NO_SCH-NEXT: ldx.d $zero, $a1, $a0 ; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 ; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) @@ -148,8 +145,7 @@ define void @foo() nounwind { ; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGE_SCH-NEXT: ldx.d $zero, $a1, $a0 ; LARGE_SCH-NEXT: ori $a0, $zero, 1 ; LARGE_SCH-NEXT: pcalau12i $a1, 
%got_pc_hi20(bar) ; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) diff --git a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll index 0a401ebe5f6b2ae..8dd1ec465c13add 100644 --- a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll +++ b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll @@ -94,8 +94,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: fld.s $fa1, $a0, 0 ; LA32F-NEXT: addi.w $a0, $zero, 1 ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI1_0) -; LA32F-NEXT: fld.s $fa2, $a2, 0 +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA32F-NEXT: movgr2fr.w $fa3, $a0 ; LA32F-NEXT: ffint.s.w $fa3, $fa3 ; LA32F-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -110,8 +109,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: fld.s $fa1, $a0, 0 ; LA32D-NEXT: addi.w $a0, $zero, 1 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI1_0) -; LA32D-NEXT: fld.s $fa2, $a2, 0 +; LA32D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA32D-NEXT: movgr2fr.w $fa3, $a0 ; LA32D-NEXT: ffint.s.w $fa3, $fa3 ; LA32D-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -126,8 +124,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: fld.s $fa1, $a0, 0 ; LA64F-NEXT: addi.w $a0, $zero, 1 ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI1_0) -; LA64F-NEXT: fld.s $fa2, $a2, 0 +; LA64F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA64F-NEXT: movgr2fr.w $fa3, $a0 ; LA64F-NEXT: ffint.s.w $fa3, $fa3 ; LA64F-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -142,8 +139,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: fld.s $fa1, $a0, 0 ; LA64D-NEXT: addi.w $a0, $zero, 1 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI1_0) -; LA64D-NEXT: fld.s $fa2, $a2, 0 +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA64D-NEXT: movgr2fr.w $fa3, $a0 ; LA64D-NEXT: ffint.s.w $fa3, $fa3 ; 
LA64D-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -168,14 +164,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: movgr2fr.w $fa4, $a0 ; LA32F-NEXT: ffint.s.w $fa4, $fa4 ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32F-NEXT: fld.s $fa5, $a0, 0 +; LA32F-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32F-NEXT: fld.s $fa6, $a0, 0 +; LA32F-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_2) -; LA32F-NEXT: fld.s $fa7, $a0, 0 +; LA32F-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA32F-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA32F-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA32F-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -196,14 +189,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: movgr2fr.w $fa4, $a0 ; LA32D-NEXT: ffint.s.w $fa4, $fa4 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32D-NEXT: fld.s $fa5, $a0, 0 +; LA32D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32D-NEXT: fld.s $fa6, $a0, 0 +; LA32D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_2) -; LA32D-NEXT: fld.s $fa7, $a0, 0 +; LA32D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA32D-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA32D-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA32D-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -224,14 +214,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: movgr2fr.w $fa4, $a0 ; LA64F-NEXT: ffint.s.w $fa4, $fa4 ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64F-NEXT: fld.s $fa5, $a0, 0 +; LA64F-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA64F-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI2_1) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64F-NEXT: fld.s $fa6, $a0, 0 +; LA64F-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_2) -; LA64F-NEXT: fld.s $fa7, $a0, 0 +; LA64F-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA64F-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA64F-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA64F-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -252,14 +239,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: movgr2fr.w $fa4, $a0 ; LA64D-NEXT: ffint.s.w $fa4, $fa4 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64D-NEXT: fld.s $fa5, $a0, 0 +; LA64D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64D-NEXT: fld.s $fa6, $a0, 0 +; LA64D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_2) -; LA64D-NEXT: fld.s $fa7, $a0, 0 +; LA64D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA64D-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA64D-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA64D-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -281,14 +265,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: addi.w $a2, $zero, 1 ; LA32F-NEXT: movgr2fr.w $fa0, $a2 ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_0) -; LA32F-NEXT: fld.s $fa1, $a2, 0 +; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_1) -; LA32F-NEXT: fld.s $fa2, $a2, 0 +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_2) -; LA32F-NEXT: fld.s $fa3, $a2, 0 +; LA32F-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA32F-NEXT: fld.s $fa4, $a0, 28 ; 
LA32F-NEXT: fld.s $fa5, $a0, 24 ; LA32F-NEXT: fld.s $fa6, $a0, 12 @@ -321,14 +302,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a2, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa0, $a2 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_0) -; LA32D-NEXT: fld.s $fa1, $a2, 0 +; LA32D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_1) -; LA32D-NEXT: fld.s $fa2, $a2, 0 +; LA32D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_2) -; LA32D-NEXT: fld.s $fa3, $a2, 0 +; LA32D-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA32D-NEXT: fld.s $fa4, $a0, 28 ; LA32D-NEXT: fld.s $fa5, $a0, 24 ; LA32D-NEXT: fld.s $fa6, $a0, 12 @@ -361,14 +339,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: addi.w $a2, $zero, 1 ; LA64F-NEXT: movgr2fr.w $fa0, $a2 ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_0) -; LA64F-NEXT: fld.s $fa1, $a2, 0 +; LA64F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_1) -; LA64F-NEXT: fld.s $fa2, $a2, 0 +; LA64F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_2) -; LA64F-NEXT: fld.s $fa3, $a2, 0 +; LA64F-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA64F-NEXT: fld.s $fa4, $a0, 28 ; LA64F-NEXT: fld.s $fa5, $a0, 24 ; LA64F-NEXT: fld.s $fa6, $a0, 12 @@ -401,14 +376,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: addi.w $a2, $zero, 1 ; LA64D-NEXT: movgr2fr.w $fa0, $a2 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_0) -; LA64D-NEXT: fld.s $fa1, $a2, 0 +; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA64D-NEXT: 
pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_1) -; LA64D-NEXT: fld.s $fa2, $a2, 0 +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_2) -; LA64D-NEXT: fld.s $fa3, $a2, 0 +; LA64D-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA64D-NEXT: fld.s $fa4, $a0, 28 ; LA64D-NEXT: fld.s $fa5, $a0, 24 ; LA64D-NEXT: fld.s $fa6, $a0, 12 @@ -488,8 +460,7 @@ define void @test_d2(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a0, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa2, $a0 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_0) -; LA32D-NEXT: fld.d $fa3, $a0, 0 +; LA32D-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI4_0) ; LA32D-NEXT: ffint.s.w $fa2, $fa2 ; LA32D-NEXT: fcvt.d.s $fa2, $fa2 ; LA32D-NEXT: fadd.d $fa1, $fa1, $fa2 @@ -529,8 +500,7 @@ define void @test_d2(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: fld.d $fa1, $a0, 0 ; LA64D-NEXT: addi.d $a0, $zero, 1 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI4_0) -; LA64D-NEXT: fld.d $fa2, $a2, 0 +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI4_0) ; LA64D-NEXT: movgr2fr.d $fa3, $a0 ; LA64D-NEXT: ffint.d.l $fa3, $fa3 ; LA64D-NEXT: fadd.d $fa1, $fa1, $fa3 @@ -625,14 +595,11 @@ define void @test_d4(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: ffint.s.w $fa4, $fa4 ; LA32D-NEXT: fcvt.d.s $fa4, $fa4 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_0) -; LA32D-NEXT: fld.d $fa5, $a0, 0 +; LA32D-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_0) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_1) -; LA32D-NEXT: fld.d $fa6, $a0, 0 +; LA32D-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_1) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_2) -; LA32D-NEXT: fld.d $fa7, $a0, 0 +; LA32D-NEXT: fld.d $fa7, $a0, 
%pc_lo12(.LCPI5_2) ; LA32D-NEXT: fadd.d $fa3, $fa3, $fa4 ; LA32D-NEXT: fadd.d $fa2, $fa2, $fa5 ; LA32D-NEXT: fadd.d $fa1, $fa1, $fa6 @@ -696,14 +663,11 @@ define void @test_d4(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: movgr2fr.d $fa4, $a0 ; LA64D-NEXT: ffint.d.l $fa4, $fa4 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64D-NEXT: fld.d $fa5, $a0, 0 +; LA64D-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_1) -; LA64D-NEXT: fld.d $fa6, $a0, 0 +; LA64D-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_1) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_2) -; LA64D-NEXT: fld.d $fa7, $a0, 0 +; LA64D-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI5_2) ; LA64D-NEXT: fadd.d $fa3, $fa3, $fa4 ; LA64D-NEXT: fadd.d $fa2, $fa2, $fa5 ; LA64D-NEXT: fadd.d $fa1, $fa1, $fa6 @@ -852,14 +816,11 @@ define void @test_d8(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a2, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa0, $a2 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_0) -; LA32D-NEXT: fld.d $fa1, $a2, 0 +; LA32D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI6_0) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_1) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_1) -; LA32D-NEXT: fld.d $fa2, $a2, 0 +; LA32D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI6_1) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_2) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_2) -; LA32D-NEXT: fld.d $fa3, $a2, 0 +; LA32D-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI6_2) ; LA32D-NEXT: fld.d $fa4, $a0, 56 ; LA32D-NEXT: fld.d $fa5, $a0, 48 ; LA32D-NEXT: fld.d $fa6, $a0, 24 @@ -976,14 +937,11 @@ define void @test_d8(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: addi.d $a2, $zero, 1 ; LA64D-NEXT: movgr2fr.d $fa0, $a2 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_0) -; LA64D-NEXT: fld.d $fa1, 
$a2, 0 +; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI6_0) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_1) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_1) -; LA64D-NEXT: fld.d $fa2, $a2, 0 +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI6_1) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_2) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_2) -; LA64D-NEXT: fld.d $fa3, $a2, 0 +; LA64D-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI6_2) ; LA64D-NEXT: fld.d $fa4, $a0, 56 ; LA64D-NEXT: fld.d $fa5, $a0, 48 ; LA64D-NEXT: fld.d $fa6, $a0, 24 diff --git a/llvm/test/CodeGen/RISCV/double-convert-strict.ll b/llvm/test/CodeGen/RISCV/double-convert-strict.ll index 13bcafb5ebd1361..3732978b8bd83ea 100644 --- a/llvm/test/CodeGen/RISCV/double-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/double-convert-strict.ll @@ -777,11 +777,9 @@ define signext i32 @fcvt_d_w_demanded_bits(i32 signext %0, ptr %1) nounwind stri ; ; RV64IZFINXZDINX-LABEL: fcvt_d_w_demanded_bits: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: addiw a2, a0, 1 -; RV64IZFINXZDINX-NEXT: addi a0, a0, 1 -; RV64IZFINXZDINX-NEXT: fcvt.d.w a0, a0 -; RV64IZFINXZDINX-NEXT: sd a0, 0(a1) -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: addiw a0, a0, 1 +; RV64IZFINXZDINX-NEXT: fcvt.d.w a2, a0 +; RV64IZFINXZDINX-NEXT: sd a2, 0(a1) ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_d_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index feea4f19720b0ba..2e2e1b924cf0096 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -1459,11 +1459,9 @@ define signext i32 @fcvt_d_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_d_w_demanded_bits: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: addiw a2, a0, 1 -; RV64IZFINXZDINX-NEXT: addi a0, a0, 1 -; RV64IZFINXZDINX-NEXT: fcvt.d.w a0, a0 -; RV64IZFINXZDINX-NEXT: sd a0, 0(a1) -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; 
RV64IZFINXZDINX-NEXT: addiw a0, a0, 1 +; RV64IZFINXZDINX-NEXT: fcvt.d.w a2, a0 +; RV64IZFINXZDINX-NEXT: sd a2, 0(a1) ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_d_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/float-convert-strict.ll b/llvm/test/CodeGen/RISCV/float-convert-strict.ll index 402d6f0362e6d3d..0c265e11652a2ec 100644 --- a/llvm/test/CodeGen/RISCV/float-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/float-convert-strict.ll @@ -645,11 +645,9 @@ define signext i32 @fcvt_s_w_demanded_bits(i32 signext %0, ptr %1) nounwind stri ; ; RV64IZFINX-LABEL: fcvt_s_w_demanded_bits: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: addiw a2, a0, 1 -; RV64IZFINX-NEXT: addi a0, a0, 1 -; RV64IZFINX-NEXT: fcvt.s.w a0, a0 -; RV64IZFINX-NEXT: sw a0, 0(a1) -; RV64IZFINX-NEXT: mv a0, a2 +; RV64IZFINX-NEXT: addiw a0, a0, 1 +; RV64IZFINX-NEXT: fcvt.s.w a2, a0 +; RV64IZFINX-NEXT: sw a2, 0(a1) ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_s_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 7eabd3f5f2273af..21bf6618c52a26b 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -1247,11 +1247,9 @@ define signext i32 @fcvt_s_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZFINX-LABEL: fcvt_s_w_demanded_bits: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: addiw a2, a0, 1 -; RV64IZFINX-NEXT: addi a0, a0, 1 -; RV64IZFINX-NEXT: fcvt.s.w a0, a0 -; RV64IZFINX-NEXT: sw a0, 0(a1) -; RV64IZFINX-NEXT: mv a0, a2 +; RV64IZFINX-NEXT: addiw a0, a0, 1 +; RV64IZFINX-NEXT: fcvt.s.w a2, a0 +; RV64IZFINX-NEXT: sw a2, 0(a1) ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_s_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll index 677aa9263ea6155..8f88a4c570ea054 100644 --- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll @@ -1963,11 +1963,9 @@ define 
signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) strictfp { ; ; RV64IZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: addiw a2, a0, 1 -; RV64IZHINX-NEXT: addi a0, a0, 1 -; RV64IZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZHINX-NEXT: sh a0, 0(a1) -; RV64IZHINX-NEXT: mv a0, a2 +; RV64IZHINX-NEXT: addiw a0, a0, 1 +; RV64IZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZHINX-NEXT: sh a2, 0(a1) ; RV64IZHINX-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_h_w_demanded_bits: @@ -1993,11 +1991,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) strictfp { ; ; RV64IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZDINXZHINX: # %bb.0: -; RV64IZDINXZHINX-NEXT: addiw a2, a0, 1 -; RV64IZDINXZHINX-NEXT: addi a0, a0, 1 -; RV64IZDINXZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZDINXZHINX-NEXT: sh a0, 0(a1) -; RV64IZDINXZHINX-NEXT: mv a0, a2 +; RV64IZDINXZHINX-NEXT: addiw a0, a0, 1 +; RV64IZDINXZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZDINXZHINX-NEXT: sh a2, 0(a1) ; RV64IZDINXZHINX-NEXT: ret ; ; CHECK32-IZFHMIN-LABEL: fcvt_h_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index 31fb6e2ee9c8409..48bfe1c37c625c6 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -5760,11 +5760,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: addiw a2, a0, 1 -; RV64IZHINX-NEXT: addi a0, a0, 1 -; RV64IZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZHINX-NEXT: sh a0, 0(a1) -; RV64IZHINX-NEXT: mv a0, a2 +; RV64IZHINX-NEXT: addiw a0, a0, 1 +; RV64IZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZHINX-NEXT: sh a2, 0(a1) ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: @@ -5776,11 +5774,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZDINXZHINX: # %bb.0: -; 
RV64IZDINXZHINX-NEXT: addiw a2, a0, 1 -; RV64IZDINXZHINX-NEXT: addi a0, a0, 1 -; RV64IZDINXZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZDINXZHINX-NEXT: sh a0, 0(a1) -; RV64IZDINXZHINX-NEXT: mv a0, a2 +; RV64IZDINXZHINX-NEXT: addiw a0, a0, 1 +; RV64IZDINXZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZDINXZHINX-NEXT: sh a2, 0(a1) ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_h_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 17a63eff26ac134..3cfcb4398a1f00c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V -; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X -; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X +; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X +; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_1: @@ -191,7 +191,7 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vrev8.v v8, v8 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 
x i8> %v, <8 x i8> poison, <8 x i32> @@ -215,7 +215,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_8: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 8 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -239,7 +239,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -263,7 +263,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_24: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 24 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -287,10 +287,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_8(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_8: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 7 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 7 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -313,10 +313,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, 
ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 2 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 6 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 2 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 6 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -339,10 +339,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_24(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_24: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 3 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 5 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 3 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 5 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -365,10 +365,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_32(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 4 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 4 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 4 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 4 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -391,10 +391,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_40(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_40: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 5 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 3 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; 
ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 5 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 3 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -417,10 +417,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_48(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_48: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 6 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 2 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 6 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 2 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -443,10 +443,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_56: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 7 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 7 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 1 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -469,7 +469,7 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> @@ -512,11 +512,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; 
ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -558,11 +558,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -604,11 +604,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; 
ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -650,12 +650,12 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI22_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v16, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v24, v16 -; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %shuffle @@ -678,7 +678,7 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> @@ -721,11 +721,11 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; 
ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -767,11 +767,11 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -813,11 +813,11 @@ define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -859,12 +859,12 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v16, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v24, v16 -; ZVKB-ZVE32X-NEXT: 
vsetvli zero, zero, e32, m8, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> ret <8 x float> %shuffle diff --git a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll new file mode 100644 index 000000000000000..c6ace3eb55b287b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll @@ -0,0 +1,111 @@ +; RUN: llc %s -o - -mtriple=thumbv8m.main -mattr=+vfp4 | FileCheck %s + +;; No outgoing arguments, plenty of free registers to hold the target address. +define void @test0(ptr %fptr) { +; CHECK-LABEL: test0: +; CHECK: bx {{r0|r1|r2|r3|r12}} +entry: + tail call void %fptr() + ret void +} + +;; Four integer outgoing arguments, which use up r0-r3. +define void @test1(ptr %fptr) { +; CHECK-LABEL: test1: +; CHECK: bx r12 +entry: + tail call void %fptr(i32 0, i32 0, i32 0, i32 0) + ret void +} + +;; Four integer outgoing arguments, which use up r0-r3, and sign-return-address +;; uses r12, so we can never tail-call this. +define void @test2(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test2: +; CHECK: blx + entry: + tail call void %fptr(i32 0, i32 0, i32 0, i32 0) + ret void +} + +;; An i32 and an i64 argument, which uses r0, r2 and r3 for arguments, leaving +;; r1 free for the address. +define void @test3(ptr %fptr) { +; CHECK-LABEL: test3: +; CHECK: bx {{r1|r12}} +entry: + tail call void %fptr(i32 0, i64 0) + ret void +} + +;; Four float arguments, using the soft-float calling convention, which uses +;; r0-r3. 
+define void @test4(ptr %fptr) { +; CHECK-LABEL: test4: +; CHECK: bx r12 +entry: + tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the soft-float calling convention, which uses +;; r0-r3, and sign-return-address uses r12. Currently fails with "ran out of +;; registers during register allocation". +define void @test5(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test5: +; CHECK: blx +entry: + tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the hard-float calling convention, which uses +;; s0-s3, leaving all of the integer registers free for the address. +define void @test6(ptr %fptr) { +; CHECK-LABEL: test6: +; CHECK: bx {{r0|r1|r2|r3|r12}} +entry: + tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the hard-float calling convention, which uses +;; s0-s3, leaving r0-r3 free for the address, with r12 used for +;; sign-return-address. +define void @test7(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test7: +; CHECK: bx {{r0|r1|r2|r3}} +entry: + tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Two double arguments, using the soft-float calling convention, which uses +;; r0-r3. +define void @test8(ptr %fptr) { +; CHECK-LABEL: test8: +; CHECK: bx r12 +entry: + tail call arm_aapcscc void %fptr(double 0.0, double 0.0) + ret void +} + +;; Two double arguments, using the soft-float calling convention, which uses +;; r0-r3, and sign-return-address uses r12, so we can't tail-call this. 
+define void @test9(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test9: +; CHECK: blx +entry: + tail call arm_aapcscc void %fptr(double 0.0, double 0.0) + ret void +} + +;; Four integer arguments (one on the stack), but due to alignment r1 is left +; empty, so can be used for the tail-call. +define void @test10(ptr %fptr, i64 %b, i32 %c) "sign-return-address"="all" { +; CHECK-LABEL: test10: +; CHECK: bx r1 +entry: + tail call void %fptr(i32 0, i64 %b, i32 %c) + ret void +} diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll index 9d5934c345f8a5a..3123efc306360e4 100644 --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -452,3 +452,16 @@ define void @PR52032_4(ptr %p, ptr %q) { store <4 x i32> %i9, ptr %p2, align 4 ret void } + +; FIXME: Failure to fold add(xor(bsr(x),-32),33) -> add(or(bsr(x),-32),33) -> add(bsr(x),1) +define i32 @PR74101(i32 %a0) { +; CHECK-LABEL: PR74101: +; CHECK: # %bb.0: +; CHECK-NEXT: bsrl %edi, %eax +; CHECK-NEXT: xorl $-32, %eax +; CHECK-NEXT: addl $33, %eax +; CHECK-NEXT: retq + %lz = call i32 @llvm.ctlz.i32(i32 %a0, i1 true) + %add = sub nuw nsw i32 32, %lz + ret i32 %add +} diff --git a/llvm/test/CodeGen/X86/section-stats.ll b/llvm/test/CodeGen/X86/section-stats.ll new file mode 100644 index 000000000000000..94d0a965ac59ee4 --- /dev/null +++ b/llvm/test/CodeGen/X86/section-stats.ll @@ -0,0 +1,13 @@ +; REQUIRES: asserts +; RUN: llc -o /dev/null -filetype=obj -stats %s 2>&1 | FileCheck %s + +; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC text sections +; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC read-write sections + +target triple = "x86_64-unknown-linux-gnu" + +@g = global i8 1 + +define void @f() { + ret void +} diff --git a/llvm/test/DebugInfo/X86/undef-dbg-val.ll b/llvm/test/DebugInfo/X86/undef-dbg-val.ll new file mode 100644 index 000000000000000..61f3776c22d5a41 --- /dev/null +++ 
b/llvm/test/DebugInfo/X86/undef-dbg-val.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -passes=globalopt --experimental-debuginfo-iterators=false <%s | FileCheck %s +; CHECK: #dbg_value(ptr undef, +; CHECK-SAME: [[VAR:![0-9]+]], +; CHECK-SAME: !DIExpression() +; CHECK: [[VAR]] = !DILocalVariable(name: "_format" + + +; ModuleID = '' +source_filename = "test.cpp" + +@_ZZZZ4main_format = internal constant [24 x i8] c"Result1: Hello, World!\0A\00", align 16, !dbg !9 + +define void @foo() align 2 !dbg !5 { +entry: + call void @llvm.dbg.value(metadata ptr @_ZZZZ4main_format, metadata !11, metadata !DIExpression()), !dbg !12 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.cpp", directory: "/path/to") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 27, type: !6, scopeLine: 27, spFlags: DISPFlagDefinition, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{null} +!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression()) +!10 = distinct !DIGlobalVariable(name: "_format", scope: !5, file: !1, line: 49, type: !8, isLocal: true, isDefinition: true) +!11 = !DILocalVariable(name: "_format", arg: 1, scope: !5, file: !1, line: 79, type: !8) +!12 = !DILocation(line: 0, scope: !5) diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll index a755562d683fbd6..4cc038f03ff2c36 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll @@ -35,7 +35,12 @@ define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: 
[[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -74,7 +79,12 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -113,7 +123,12 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; 
CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -152,7 +167,12 @@ define <1 x i64> @sqshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -179,7 +199,8 @@ define <1 x i64> @sqshl1d_constant(ptr %A) nounwind 
sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -217,7 +238,10 @@ define i64 @sqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -244,7 +268,8 @@ define i64 @sqshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 
@llvm.aarch64.neon.sqshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -282,7 +307,12 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -321,7 +351,12 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -360,7 +395,12 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -399,7 +439,13 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -438,7 +484,13 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -477,7 +529,13 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] 
to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -516,7 +574,13 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -555,7 +619,13 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -594,7 +664,13 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -633,7 +709,13 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], 
[[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -672,7 +754,13 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -711,7 +799,12 @@ define <1 x i64> @uqshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr 
i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -738,7 +831,8 @@ define <1 x i64> @uqshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -776,7 +870,10 @@ define i64 @uqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = 
sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -803,7 +900,8 @@ define i64 @uqshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -864,7 +962,12 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -903,7 +1006,12 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -942,7 +1050,12 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -981,7 +1094,12 @@ define <1 x i64> @srshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1008,7 +1126,8 @@ define <1 x i64> @srshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1046,7 +1165,10 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: 
[[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1073,7 +1195,8 @@ define i64 @srshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1111,7 +1234,12 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> 
[[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1150,7 +1278,12 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1189,7 +1322,12 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> 
[[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1228,7 +1366,12 @@ define <1 x i64> @urshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1255,7 +1398,8 @@ define <1 x i64> @urshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store 
<1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1293,7 +1437,10 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1320,7 +1467,8 @@ define i64 @urshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1358,7 +1506,13 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: 
[[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1397,7 +1551,13 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -1436,7 +1596,13 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1475,7 +1641,13 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -1514,7 +1686,13 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 
[[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1553,7 +1731,13 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: 
ret <8 x i16> [[TMP3]] @@ -1592,7 +1776,13 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1631,7 +1821,13 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -1692,7 +1888,12 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1731,7 +1932,12 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = 
call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1770,7 +1976,12 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1809,7 +2020,12 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; 
CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1848,7 +2064,12 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1887,7 +2108,12 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x 
i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1926,7 +2152,13 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1965,7 +2197,13 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; 
CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2004,7 +2242,13 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2043,7 +2287,13 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2082,7 +2332,12 @@ define <1 x i64> @sqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2109,7 +2364,8 @@ define <1 x i64> @sqrshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2147,7 +2403,10 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2174,7 +2433,8 @@ define i64 @sqrshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2212,7 +2472,13 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; 
CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2251,7 +2517,13 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ 
-2290,7 +2562,13 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2329,7 +2607,13 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2368,7 +2652,12 @@ define <1 x i64> @uqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2395,7 +2684,8 @@ define <1 x i64> @uqrshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2433,7 +2723,10 @@ define i64 
@uqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2460,7 +2753,8 @@ define i64 @uqrshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2508,7 +2802,8 @@ define <8 x i8> @urshr8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -2534,7 +2829,8 @@ define <4 x i16> @urshr4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -2560,7 +2856,8 @@ define <2 x i32> @urshr2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -2586,7 +2883,8 @@ define <16 x i8> @urshr16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], 
zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2612,7 +2910,8 @@ define <8 x i16> @urshr8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2638,7 +2937,8 @@ define <4 x i32> @urshr4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2664,7 +2964,8 @@ define <2 x i64> @urshr2d(ptr %A) nounwind sanitize_memory { ; 
CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2690,7 +2991,8 @@ define <1 x i64> @urshr1d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2716,7 +3018,8 @@ define i64 @urshr_scalar(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2742,7 +3045,8 @@ define <8 x i8> @srshr8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -2768,7 +3072,8 @@ define <4 x i16> @srshr4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -2794,7 +3099,8 @@ define <2 x i32> @srshr2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -2820,7 +3126,8 @@ define <16 x i8> @srshr16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2846,7 +3153,8 @@ define <8 x i16> @srshr8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2872,7 +3180,8 @@ define <4 x i32> @srshr4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: 
[[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2898,7 +3207,8 @@ define <2 x i64> @srshr2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2924,7 +3234,8 @@ define <1 x i64> @srshr1d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret 
<1 x i64> [[TMP3]] @@ -2950,7 +3261,8 @@ define i64 @srshr_scalar(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2976,7 +3288,8 @@ define <8 x i8> @sqshlu8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -3002,7 +3315,8 @@ define <4 x i16> @sqshlu4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -3028,7 +3342,8 @@ define <2 x i32> @sqshlu2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -3054,7 +3369,8 @@ define <16 x i8> @sqshlu16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -3080,7 +3396,8 @@ define <8 x i16> @sqshlu8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> 
[[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -3106,7 +3423,8 @@ define <4 x i32> @sqshlu4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -3132,7 +3450,8 @@ define <2 x i64> @sqshlu2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -3158,7 +3477,8 @@ define <1 x i64> @sqshlu1d_constant(ptr %A) 
nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -3184,7 +3504,8 @@ define i64 @sqshlu_i64_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -3210,7 +3531,8 @@ define i32 @sqshlu_i32_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[_MSLD]], i32 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[TMP1]], i32 1) ; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 
[[TMP3]] @@ -3248,15 +3570,10 @@ define <8 x i8> @rshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -3280,15 +3597,10 @@ define <4 x i16> @rshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: 
store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -3312,15 +3624,10 @@ define <2 x i32> @rshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -3356,15 +3663,10 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -3404,15 +3706,10 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -3452,15 +3749,10 @@ define <4 x i32> 
@rshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -3713,14 +4005,10 @@ define i32 @sqshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[A]], i32 1) -; 
CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1) @@ -3743,15 +4031,10 @@ define <8 x i8> @sqshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -3775,15 +4058,10 @@ define <4 x i16> @sqshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 
x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -3807,15 +4085,10 @@ define <2 x i32> @sqshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -3852,15 +4125,10 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label 
[[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -3900,15 +4168,10 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -3948,15 +4211,10 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -3978,14 +4236,10 @@ define i32 @sqshrun1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; 
CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1) @@ -4008,15 +4262,10 @@ define <8 x i8> @sqshrun8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4040,15 +4289,10 @@ define <4 x i16> @sqshrun4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4072,15 +4316,10 @@ define <2 x i32> @sqshrun2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4116,15 +4355,10 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: 
[[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4164,15 +4398,10 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; 
CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4212,15 +4441,10 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4242,14 +4466,10 @@ define i32 @sqrshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp 
ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1) @@ -4272,15 +4492,10 @@ define <8 x i8> @sqrshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4304,15 +4519,10 @@ define <4 x i16> @sqrshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: 
[[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4336,15 +4546,10 @@ define <2 x i32> @sqrshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ 
-4380,15 +4585,10 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4428,15 +4628,10 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4476,15 +4671,10 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4506,14 +4696,10 @@ define 
i32 @sqrshrun1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1) @@ -4536,15 +4722,10 @@ define <8 x i8> @sqrshrun8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = 
load <8 x i16>, ptr %A @@ -4568,15 +4749,10 @@ define <4 x i16> @sqrshrun4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4600,15 +4776,10 @@ define <2 x i32> @sqrshrun2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 
x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4644,15 +4815,10 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4692,15 +4858,10 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: 
br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4740,15 +4901,10 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], 
<4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4770,14 +4926,10 @@ define i32 @uqrshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1) @@ -4800,15 +4952,10 @@ define <8 x i8> @uqrshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4832,15 +4979,10 @@ define <4 x i16> @uqrshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4864,15 +5006,10 @@ define <2 x i32> @uqrshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; 
CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4908,15 +5045,10 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4956,15 +5088,10 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: 
[[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -5004,15 +5131,10 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -5034,14 +5156,10 @@ define i32 @uqshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1) @@ -5064,15 +5182,10 @@ define <8 x i8> @uqshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable 
-; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -5096,15 +5209,10 @@ define <4 x i16> @uqshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -5128,15 +5236,10 @@ define <2 x i32> @uqshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 
[[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -5172,15 +5275,10 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -5220,15 +5318,10 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -5268,15 +5361,10 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: 
[[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -5507,7 +5595,8 @@ define <8 x i16> @neon.ushll8h_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP2]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5536,7 +5625,13 @@ define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind sanitize_memory ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[_MSPROP]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i128 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 
[[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i128 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i128 [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5565,7 +5660,8 @@ define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind sani ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i8> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5592,7 +5688,8 @@ define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: 
store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5620,7 +5717,8 @@ define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5650,7 +5748,8 @@ define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5666,8 +5765,10 @@ define <4 x i32> @neon.ushll4s_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @neon.ushll4s_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> zeroinitializer, 
<4 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> ) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> ) @@ -5692,7 +5793,8 @@ define <2 x i64> @neon.ushll2d_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <2 x i32> [[_MSLD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -5721,7 +5823,8 @@ define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind sanitize_mem ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <1 x i32> [[_MSLD]] to <1 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <1 x i32> [[TMP1]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[_MSPROP]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[TMP2]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret 
<1 x i64> [[TMP3]] @@ -5750,7 +5853,8 @@ define i64 @neon.ushl_scalar_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[_MSPROP]], i64 1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[TMP2]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -5844,7 +5948,8 @@ define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5870,7 +5975,8 @@ define <16 x i8> @neon.sshl16b_non_splat_constant_shift(ptr %A) nounwind sanitiz ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; 
CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5896,7 +6002,8 @@ define <16 x i8> @neon.sshl16b_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5924,7 +6031,8 @@ define <8 x i16> @neon.sshll8h_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[TMP2]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5953,7 +6061,8 @@ define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(ptr %A) nounwind sanitize ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i8> [[_MSLD]] to <4 x 
i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5982,7 +6091,8 @@ define <4 x i32> @neon.sshll4s_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6011,7 +6121,8 @@ define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x 
i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6027,8 +6138,10 @@ define <4 x i32> @neon.sshl4s_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @neon.sshl4s_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> zeroinitializer, <4 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> ) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> ) @@ -6051,7 +6164,8 @@ define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6079,7 +6193,8 @@ define <2 x i64> @neon.sshll2d_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i32> [[_MSLD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 
x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6108,7 +6223,8 @@ define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind sanitize_me ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <1 x i32> [[_MSLD]] to <1 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <1 x i32> [[TMP1]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[_MSPROP]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[TMP2]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -6137,7 +6253,8 @@ define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[_MSPROP]], i64 1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[TMP2]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -6166,7 +6283,8 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind sanitize_memory ; 
CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[_MSPROP]], i64 -1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[TMP2]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -6182,8 +6300,10 @@ define <2 x i64> @neon.sshl2d_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @neon.sshl2d_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> zeroinitializer, <2 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> ) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> ) @@ -6206,7 +6326,8 @@ define <2 x i64> @neon.sshl2d_no_fold(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: 
store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6331,7 +6452,8 @@ define <8 x i8> @sqshli8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6357,7 +6479,8 @@ define <4 x i16> @sqshli4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -6383,7 +6506,8 @@ define <2 x i32> @sqshli2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -6409,7 +6533,8 @@ define <16 x i8> @sqshli16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -6435,7 +6560,8 @@ define <8 x i16> @sqshli8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -6461,7 +6587,8 @@ define <4 x i32> @sqshli4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; 
CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6487,7 +6614,8 @@ define <2 x i64> @sqshli2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6513,7 +6641,8 @@ define <8 x i8> @uqshli8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> 
[[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6539,7 +6668,8 @@ define <8 x i8> @uqshli8b_1(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6565,7 +6695,8 @@ define <4 x i16> @uqshli4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -6591,7 +6722,8 @@ define <2 x i32> @uqshli2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> 
[[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -6617,7 +6749,8 @@ define <16 x i8> @uqshli16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -6643,7 +6776,8 @@ define <8 x i16> @uqshli8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -6669,7 +6803,8 @@ define <4 x i32> @uqshli4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 
[[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6695,7 +6830,8 @@ define <2 x i64> @uqshli2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6722,14 +6858,15 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6764,14 +6901,15 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6806,14 +6944,15 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8 -; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6848,14 +6987,15 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr 
[[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6890,14 +7030,15 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6932,14 +7073,15 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; 
CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6974,14 +7116,15 @@ define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7016,14 +7159,15 @@ define <1 x i64> @ursra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; 
CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7058,14 +7202,15 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[B]], align 8 ; 
CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7100,14 +7245,15 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7142,14 +7288,15 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne 
i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7184,14 +7331,15 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7226,14 +7374,15 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr 
[[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7268,14 +7417,15 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = 
load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7310,14 +7460,15 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7352,14 +7503,15 @@ define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], 
<2 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7394,14 +7546,15 @@ define <1 x i64> @srsra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7436,14 +7589,15 @@ define i64 @srsra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr 
; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -9253,14 +9407,15 @@ define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: 
unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9285,14 +9440,15 @@ define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9317,14 +9473,15 @@ define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9349,14 +9506,15 @@ define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: 
[[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9381,14 +9539,15 @@ define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sani ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9413,14 +9572,15 @@ define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> 
zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9445,14 +9605,15 @@ define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor 
i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9473,16 +9634,12 @@ define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP0]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 13) +; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[VRSHRN_N1]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] ; entry: @@ -9497,16 +9654,12 @@ define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP0]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], 
zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 13) +; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VRSHRN_N1]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] ; entry: @@ -9523,15 +9676,10 @@ define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[B:%.*]] = add <4 x i32> [[A]], -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSPROP]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 13) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]] ; entry: diff --git a/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll new file mode 100644 index 000000000000000..a0bc4aef2cc319a --- /dev/null +++ b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -passes=rtsan -S | FileCheck %s + +define void @violation() #0 { + %1 = alloca ptr, align 8 + %2 = call ptr @malloc(i64 noundef 2) #3 + store ptr %2, ptr %1, align 
8 + ret void +} + +declare ptr @malloc(i64 noundef) #1 + +define noundef i32 @main() #2 { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + call void @violation() #4 + ret i32 0 +} + +attributes #0 = { mustprogress noinline sanitize_realtime optnone ssp uwtable(sync) } + +; RealtimeSanitizer pass should insert __rtsan_realtime_enter right after function definition +; CHECK-LABEL: @violation() +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_enter + +; RealtimeSanitizer pass should insert __rtsan_realtime_exit right before function return +; CHECK: call{{.*}}@__rtsan_realtime_exit +; CHECK-NEXT: ret{{.*}}void diff --git a/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll new file mode 100644 index 000000000000000..39a1ff0b7c442ae --- /dev/null +++ b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -passes=rtsan -S | FileCheck %s + +define i32 @example(i32 %x) #0 { +entry: + %retval = alloca i32 + %cmp = icmp sgt i32 %x, 10 + br i1 %cmp, label %then, label %else + +then: + ret i32 1 + +else: + ret i32 0 +} + +attributes #0 = { mustprogress noinline sanitize_realtime optnone ssp uwtable(sync) } + +; RealtimeSanitizer pass should insert __rtsan_realtime_enter right after function definition +; CHECK-LABEL: @example( +; CHECK-NEXT: entry: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_enter + +; RealtimeSanitizer pass should insert the call at both function returns +; CHECK-LABEL: then: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_exit +; CHECK-NEXT: ret i32 1 + +; CHECK-LABEL: else: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_exit +; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s index 7f99afe01925997..68442b01bf7d909 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s @@ -169,21 +169,3 @@ s_load_b96 s[20:22], s[2:3], s0 s_buffer_load_b96 
s[20:22], s[4:7], s0 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s new file mode 100644 index 000000000000000..aa2309dd7d5d7cd --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s @@ -0,0 +1,10 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s + +v_mov_b16 v0.l, s0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, ttmp0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, a0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll index d51e9291a6119ca..78969839efcb8a2 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll @@ -127,8 +127,6 @@ define i16 @test_cmpxchg_i16_global_agent_align4(ptr addrspace(1) %out, i16 %in, define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX90A-LABEL: define void @syncscope_workgroup_nortn( ; GFX90A-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { -; 
GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll new file mode 100644 index 000000000000000..44cd7097059cd2e --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll @@ -0,0 +1,10 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck %s + +; CHECK: %preserve_me = phi float [ %{{[0-9]+}}, %atomicrmw.shared ], [ %loaded.private, %atomicrmw.private ], [ %{{[0-9]+}}, %atomicrmw.global ] +; CHECK: ret float %preserve_me +define float @expand_preserve_name(ptr %addr, float %val) { + %preserve_me = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %preserve_me +} + +!0 = !{} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index 70dc5b267f73b98..fc586a01e3bcf8e 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -22,8 +22,6 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @syncscope_system( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 
[[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -36,18 +34,18 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @syncscope_system( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]] @@ -94,8 +92,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @syncscope_workgroup_rtn( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: 
atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -108,18 +104,18 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @syncscope_workgroup_rtn( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] @@ -150,8 +146,6 @@ define float 
@syncscope_workgroup_rtn(ptr %addr, float %val) { define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-LABEL: @syncscope_workgroup_nortn( -; GFX908-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX908: atomicrmw.check.shared: ; GFX908-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX908: atomicrmw.shared: @@ -164,22 +158,20 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908: atomicrmw.private: ; GFX908-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX908-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX908-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX908-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_PHI]] ; GFX908: atomicrmw.global: ; GFX908-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) ; GFX908-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX908-NEXT: br label [[ATOMICRMW_PHI]] ; GFX908: atomicrmw.phi: -; GFX908-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX908-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; ; GFX90A-LABEL: @syncscope_workgroup_nortn( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: 
atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -192,15 +184,15 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index def9522077004f5..0a091bd0fc9ada6 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ 
-595,8 +595,6 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #0 { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -609,18 +607,18 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #0 { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr 
[[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll index f4cebf1fcb5da0c..dc685d2c4d1368a 100644 --- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll +++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll @@ -410,3 +410,14 @@ define i8 @caller15_okay_intersect_ranges() { call void @use.val(i8 %r) ret i8 %r } + +define i8 @caller16_not_intersecting_ranges() { +; CHECK-LABEL: define i8 @caller16_not_intersecting_ranges() { +; CHECK-NEXT: [[R_I:%.*]] = call range(i8 0, 0) i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call range(i8 0, 5) i8 @callee15() + call void @use.val(i8 %r) + ret i8 %r +} diff --git a/llvm/test/Transforms/LICM/hoist-mustexec.ll b/llvm/test/Transforms/LICM/hoist-mustexec.ll index 81e0815053ffe5b..a6f5a2be05ee410 100644 --- a/llvm/test/Transforms/LICM/hoist-mustexec.ll +++ b/llvm/test/Transforms/LICM/hoist-mustexec.ll @@ -218,7 +218,6 @@ fail: } ; Same as previous case, with commuted icmp. -; FIXME: The load should get hoisted here as well. 
define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-LABEL: define i32 @test3_commuted( ; CHECK-SAME: ptr noalias nocapture readonly [[A:%.*]]) #[[ATTR1]] { @@ -227,6 +226,7 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[IS_ZERO:%.*]] = icmp eq i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[IS_ZERO]], label [[FAIL:%.*]], label [[PREHEADER:%.*]] ; CHECK: preheader: +; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[INC:%.*]], [[CONTINUE:%.*]] ] @@ -234,7 +234,6 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[R_CHK:%.*]] = icmp uge i32 [[LEN]], [[IV]] ; CHECK-NEXT: br i1 [[R_CHK]], label [[CONTINUE]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[I1]], [[ACC]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll index ba78216100598d4..f37c32396cd059c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll @@ -4,42 +4,21 @@ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64" -; We can't use scalable vectorization for Zvl32b due to RVVBitsPerBlock being -; 64. Since our vscale value is vlen/RVVBitsPerBlock this makes vscale 0. -; Make sure we fall back to fixed vectorization instead. +; We can't vectorize with Zvl32b due to RVVBitsPerBlock being 64. Since our +; vscale value is vlen/RVVBitsPerBlock this makes vscale 0. 
define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) { ; CHECK-LABEL: @vector_add_i16( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP0]], i32 2, <4 x i1> , <4 x i16> poison) -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP1]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> ) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1020, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; 
CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V]] +; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V:%.*]] ; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index b50a815be5abf5b..18ac2f92aa39d46 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; There is no profile, but that's OK because the prelink does not care about ; the content of the profile, just that we intend to use one. ; There is no scenario currently of doing ctx profile use without thinlto. @@ -7,19 +7,22 @@ declare void @bar() +;. +; CHECK: @__profn_foo = private constant [3 x i8] c"foo" +;. 
define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr { +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] ; CHECK: [[YES]]: ; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FCT]] to i64 -; CHECK-NEXT: call void @llvm.instrprof.value.profile(ptr @__profn_foo, i64 728453322856651412, i64 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) ; CHECK-NEXT: call void [[FCT]](i32 0) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[NO]]: -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -36,3 +39,6 @@ no: exit: ret void } +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll index 8fc5189e8bc79e9..cc4890e27f2bda6 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -9,95 +9,87 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) { ; CHECK-LABEL: @sum_2_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit11.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = 
load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: [[UMIN12:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i128 [[TMP1]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i128 [[TMP3]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ERROR_I10:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN15]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER20:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] 
= and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP7]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]] -; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP10]], 
[[WIDE_LOAD18]] -; CHECK-NEXT: [[TMP13]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD19]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD14]], [[VEC_PHI13]] +; CHECK-NEXT: [[TMP14]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[TMP15]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD16]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER20]] -; CHECK: loop.preheader20: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], 
label [[LOOP_PREHEADER17]] +; CHECK: loop.preheader17: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11]] ], [ [[SUM_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit: -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER17]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER17]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: 
br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -120,120 +112,111 @@ exit: define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) { ; CHECK-LABEL: @sum_3_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit22.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] -; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 8 +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[START_I12:%.*]] = load ptr, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C]], i64 8 +; CHECK-NEXT: [[END_I14:%.*]] = load 
ptr, ptr [[GEP_END_I13]], align 8 +; CHECK-NEXT: [[START_INT_I15:%.*]] = ptrtoint ptr [[START_I12]] to i64 +; CHECK-NEXT: [[END_INT_I16:%.*]] = ptrtoint ptr [[END_I14]] to i64 +; CHECK-NEXT: [[SUB_I17:%.*]] = sub i64 [[END_INT_I16]], [[START_INT_I15]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = zext i64 [[SUB_I17]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i128 [[TMP4]], 1 +; CHECK-NEXT: [[UMIN23:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP5]]) ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_I3_PEEL_FR:%.*]] = freeze ptr [[END_I3_PEEL]] -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL_FR]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[START_I12_PEEL:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[END_I14_PEEL:%.*]] = load ptr, ptr [[GEP_END_I13]], align 8 -; CHECK-NEXT: [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]] -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr 
[[START_I12_PEEL]], align 8 -; CHECK-NEXT: [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: [[UMIN24:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN23]], i128 [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[UMIN24]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[UMIN24]] +; CHECK-NEXT: br i1 [[TMP7]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i128 [[TMP3]], [[UMIN24]] +; CHECK-NEXT: br i1 [[TMP9]], label [[ERROR_I10:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[ERROR_I21:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]]) -; CHECK-NEXT: [[UMIN27:%.*]] = tail call i64 @llvm.umin.i64(i64 [[UMIN26]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN27]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER34:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER31:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = 
select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP10]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]] -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD31]] -; CHECK-NEXT: [[TMP16]] 
= add <2 x i64> [[TMP14]], [[WIDE_LOAD32]] -; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[TMP15]], [[WIDE_LOAD33]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI25:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i64>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <2 x i64>, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD26]], [[VEC_PHI25]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[TMP17]], [[WIDE_LOAD27]] +; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD28]] +; CHECK-NEXT: [[TMP21]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD29]] +; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD30]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER34]] -; CHECK: loop.preheader34: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP10]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[LOOP_PREHEADER31]] +; CHECK: loop.preheader31: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22]] ], [ [[SUM_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER31]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER31]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr 
[[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: -; CHECK-NEXT: [[INRANGE_I18:%.*]] = icmp ult i64 [[SUB_I17_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I18]], label [[ERROR_I21:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT22]] -; CHECK: error.i21: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit22: -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 -; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[IV]] ; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i21: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi 
i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll index f44705a925d5a73..46c6c10125b95ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll @@ -18,8 +18,15 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @uadd_sat_v2i16( ; GFX9-NEXT: bb: @@ -53,8 +60,15 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @usub_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> 
[[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @usub_sat_v2i16( ; GFX9-NEXT: bb: @@ -88,8 +102,15 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @sadd_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @sadd_sat_v2i16( ; GFX9-NEXT: bb: @@ -123,8 +144,15 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @ssub_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = 
extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @ssub_sat_v2i16( ; GFX9-NEXT: bb: @@ -262,11 +290,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v3i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: [[INS_2:%.*]] = 
insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; ; GFX9-LABEL: @uadd_sat_v3i16( @@ -317,11 +352,18 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) -; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 
4978991b42c8f19..36681ecea4f50fd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -142,27 +142,27 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> ; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP81]], [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP78]], [[TMP76]] ; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP77]] to <2 x i32> -; 
CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> +; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> ; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP89:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP89]] +; CHECK-NEXT: [[TMP90:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] +; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP90]] ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP99:%.*]] = sub <2 x i32> [[TMP86]], [[TMP78]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP99]] +; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP86]], [[TMP77]] +; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP98]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> ; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] ; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] @@ -182,21 +182,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr 
%add.pt ; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP109]], [[TMP114]] +; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> +; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP155]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = call <2 x i8> 
@llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP115]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP118]], [[TMP128]] +; CHECK-NEXT: [[TMP156:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP156]] to <2 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 ; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP137]], [[TMP111]] @@ -480,11 +480,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] ; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], ; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] -; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP87]] +; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] +; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP93]] ; THR15-NEXT: [[TMP92:%.*]] 
= insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] -; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP93]] +; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] +; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP87]] ; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> ; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]] ; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]] @@ -521,13 +521,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]] ; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], ; THR15-NEXT: [[TMP115:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] -; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP114]], [[TMP116]] +; THR15-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] +; THR15-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP114]], [[TMP117]] ; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0 ; THR15-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP99]] ; THR15-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP102]], [[TMP127]] -; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP117]], [[TMP128]] -; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP117]] +; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP116]], [[TMP128]] +; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP116]] ; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 ; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 ; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP118]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll new file 
mode 100644 index 000000000000000..54eb564768318b8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv64-unknown-linux -mattr=+v -passes=slp-vectorizer -S | FileCheck %s + +@src = common global [8 x double] zeroinitializer, align 64 +@dst = common global [4 x double] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <4 x double> [[TMP3]], ptr @dst, align 8 +; CHECK-NEXT: ret void +; + %a0 = load double, ptr @src, align 8 + %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 + %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 + %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 + %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 + %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 + %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 + %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 + %res1 = fsub fast double %a0, %a1 + %res2 = fsub fast double %a2, %a3 + %res3 = fsub fast double %a4, %a5 + %res4 = fsub fast double %a6, %a7 + store double %res1, ptr @dst, align 8 + store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8 + store double 
%res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8 + store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll index 7c05355e98d70b1..b80be40d9fc8610 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll @@ -31,6 +31,6 @@ define void @fun(ptr nocapture, i32 zeroext) local_unnamed_addr #0 { ._crit_edge: ; preds = %.lr.ph ret void -; CHECK: SLP: Adding cost -1 for bundle n=2 [ %4 = icmp ult i32 %2, %1, ..] +; CHECK: SLP: Adding cost -1 for bundle Idx: 3, n=2 [ %4 = icmp ult i32 %2, %1, ..] } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll index 0937b686ee2f75d..b2bcdb178b21be4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -16,10 +16,10 @@ define i32 @foo(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr 
[[B:%.*]], align 4 ; CHECK-NEXT: ret i32 0 ; @@ -59,13 +59,13 @@ define i32 @extr_user(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n ; CHECK-LABEL: @extr_user( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: %0 = load i32, ptr %A, align 4 @@ -95,13 +95,13 @@ define i32 @extr_user1(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 % ; CHECK-LABEL: @extr_user1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: %0 = load i32, ptr %A, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index c600d75ed1e8c4c..0eb18239ae3fb6e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -13,9 +13,9 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[ADD_PTR]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -92,11 +92,11 @@ define void @externally_used_ptrs() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = load 
<2 x i64>, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ADD_PTR]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[ADD_PTR]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll index d7144d750321fe1..f197b2480d61ca8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll @@ -4,6 +4,25 @@ ; RUN: -slp-skip-early-profitability-check < %s | FileCheck %s --check-prefixes=FORCED define i64 @foo() { +; CHECK-LABEL: define i64 @foo() { +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ] +; CHECK-NEXT: ret i64 0 +; CHECK: bb3: +; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0 +; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0 +; CHECK-NEXT: br i1 false, label 
[[BB3]], label [[BB1:%.*]] +; ; FORCED-LABEL: define i64 @foo() { ; FORCED-NEXT: bb: ; FORCED-NEXT: br label [[BB3:%.*]] @@ -25,25 +44,6 @@ define i64 @foo() { ; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 ; FORCED-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] ; -; CHECK-LABEL: define i64 @foo() { -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ] -; CHECK-NEXT: [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ] -; CHECK-NEXT: ret i64 0 -; CHECK: bb3: -; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0 -; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0 -; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0 -; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] -; bb: br label %bb3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index e459cd8c6955b03..e94dd2119270ce1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -11,37 +11,37 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ 
undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP8]], ptr [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: -; CHECK-NEXT: 
[[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 8562e53b1538722..9df2b9a8e8f3ec0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -189,11 +189,11 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> 
poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -238,10 +238,10 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret 
void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll index eb5d13d6fc19d74..69b4639d9c13187 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll @@ -47,22 +47,22 @@ define void @reorder_crash(ptr %ptr) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[BB0:%.*]], label [[BB12:%.*]] ; CHECK: bb0: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb12: ; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[PTR]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[PTR]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP1]], [[BB0]] ], [ [[TMP4]], [[BB1]] ], [ [[SHUFFLE]], [[BB2]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x float> [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP4]], [[BB2]] ] ; 
CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index d6dd4128de9c7e5..d7c3ccd8c9ce8a4 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -124,3 +124,26 @@ entry: store <8 x i1> %6, ptr %7, align 1 ret void } + +define void @test5(ptr %ptr0, ptr %ptr1) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr i8, ptr null, i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> , ptr [[GETELEMENTPTR0]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> [[TMP0]], ptr null, i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x ptr> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> , ptr [[PTR0:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[PTR1:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <4 x ptr> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret void +; +entry: + %getelementptr0 = getelementptr i8, ptr null, i64 0 + %0 = insertelement <4 x ptr> , ptr %getelementptr0, i32 2 + %1 = insertelement <4 x ptr> %0, ptr null, i32 3 + %2 = icmp ult <4 x ptr> zeroinitializer, %1 + %3 = insertelement <4 x ptr> , ptr %ptr0, i32 0 + %4 = insertelement <4 x ptr> %1, ptr %ptr1, i32 3 + %5 = icmp ult <4 x ptr> %3, %4 + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll index 0150b3b60d9e42b..cb2bbb8e0b9317e 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll @@ -2033,6 +2033,152 @@ join: ret void } +define i32 @many_indirect_phis(i1 %cond, i32 %a, i32 %b) { +; CHECK-LABEL: @many_indirect_phis( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: 
+; CHECK-NEXT: [[B_SINK:%.*]] = phi i32 [ [[A:%.*]], [[IF]] ], [ [[B:%.*]], [[TMP0:%.*]] ] +; CHECK-NEXT: [[DOTSINK3:%.*]] = phi i32 [ 10, [[IF]] ], [ 11, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK2:%.*]] = phi i32 [ 20, [[IF]] ], [ 21, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK1:%.*]] = phi i32 [ 30, [[IF]] ], [ 31, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK:%.*]] = phi i32 [ 40, [[IF]] ], [ 41, [[TMP0]] ] +; CHECK-NEXT: [[ADD_0_B:%.*]] = add i32 [[B_SINK]], 1 +; CHECK-NEXT: [[ADD_1_B:%.*]] = add i32 [[ADD_0_B]], [[DOTSINK3]] +; CHECK-NEXT: [[ADD_2_B:%.*]] = add i32 [[ADD_1_B]], [[DOTSINK2]] +; CHECK-NEXT: [[ADD_3_B:%.*]] = add i32 [[ADD_2_B]], [[DOTSINK1]] +; CHECK-NEXT: [[ADD_4_B:%.*]] = add i32 [[ADD_3_B]], [[DOTSINK]] +; CHECK-NEXT: ret i32 [[ADD_4_B]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.0.a = add i32 %a, 1 + %add.1.a = add i32 %add.0.a, 10 + %add.2.a = add i32 %add.1.a, 20 + %add.3.a = add i32 %add.2.a, 30 + %add.4.a = add i32 %add.3.a, 40 + br label %join + +else: + %add.0.b = add i32 %b, 1 + %add.1.b = add i32 %add.0.b, 11 + %add.2.b = add i32 %add.1.b, 21 + %add.3.b = add i32 %add.2.b, 31 + %add.4.b = add i32 %add.3.b, 41 + br label %join + +join: + %phi = phi i32 [ %add.4.a, %if ], [ %add.4.b, %else ] + ret i32 %phi +} + +define i32 @store_and_unrelated_many_phi_add(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_unrelated_many_phi_add( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + store 
i32 1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + store i32 1, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + +define i32 @store_and_related_many_phi_add(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_related_many_phi_add( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[ADD_2_SINK:%.*]] = phi i32 [ [[ADD_2]], [[ELSE]] ], [ [[ADD_1]], [[IF]] ] +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: store i32 [[ADD_2_SINK]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + store i32 %add.1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + store i32 %add.2, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + +define i32 @store_and_unrelated_many_phi_add2(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_unrelated_many_phi_add2( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: [[ADD_A_2:%.*]] = add i32 [[A]], 1 +; CHECK-NEXT: store i32 [[ADD_A_2]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + %add.a.1 
= add i32 %a, 1 + store i32 %add.a.1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + %add.a.2 = add i32 %a, 1 + store i32 %add.a.2, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + declare void @dummy() declare void @use.ptr(ptr) diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll index 385e37e2750d1e3..10a3e65e5f57d67 100644 --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll @@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: LOOP.HEADER: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ] ; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]] @@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: Flow2: -; CHECK-NEXT: [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ] ; CHECK-NEXT: br i1 [[TMP4]], label 
[[END_ELSE_BLOCK:%.*]], label [[FLOW3]] ; CHECK: INNER_LOOP: ; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ] @@ -66,19 +66,20 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271 ; CHECK-NEXT: br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] -; CHECK-NEXT: br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] +; CHECK-NEXT: [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] +; CHECK-NEXT: br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] ; CHECK: Flow4: -; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] ; CHECK: bb64: ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] -; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW2]] +; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] +; CHECK-NEXT: br i1 [[TMP10]], label [[BB18]], label 
[[FLOW2]] ; CHECK: INCREMENT_I: ; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336 diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll index 46881ec8272861c..c832b7d1394a880 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll @@ -7,8 +7,8 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]] ; CHECK: [[FLOW]]: @@ -23,17 +23,20 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ undef, %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: 
[[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -72,8 +75,8 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] ; CHECK: [[IF]]: @@ -88,17 +91,20 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: 
[[TMP5:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -137,9 +143,9 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW1:.*]] ] -; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW1]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW1:.*]] ] +; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[FLOW1]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp slt i32 [[IND]], [[X]] ; CHECK-NEXT: [[CC_INV:%.*]] = xor i1 [[CC]], true ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] @@ -150,18 +156,23 @@ define < 2 x float> 
@while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP7]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] -; CHECK-NEXT: br i1 [[TMP2]], label %[[LATCH:.*]], label %[[FLOW1]] +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ undef, %[[HEADER]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[LATCH:.*]], label %[[FLOW1]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP3]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP5]] = phi float [ [[TMP1]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP6]] = phi float [ [[TMP2]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi float [ [[TMP1]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP2]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: [[PACKED0:%.*]] = insertelement <2 x float> poison, 
float [[TMP8]], i32 0 ; CHECK-NEXT: [[PACKED1:%.*]] = insertelement <2 x float> [[PACKED0]], float [[TMP7]], i32 1 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected index 56b6c90a2f6f33f..7a7115b393b1db2 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected @@ -122,7 +122,6 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_def_cfa 22, 0 ; CHECK-NEXT: st.w $zero, $fp, -12 ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(x) -; CHECK-NEXT: addi.w $a0, $a0, %pc_lo12(x) ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: st.w $a1, $fp, -16 ; CHECK-NEXT: ori $a2, $zero, 2 @@ -131,7 +130,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: st.w $a3, $fp, -24 ; CHECK-NEXT: ori $a4, $zero, 4 ; CHECK-NEXT: st.w $a4, $fp, -28 -; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: st.w $a1, $a0, %pc_lo12(x) ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: st.w $a1, $fp, -16 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected index 2e063202fcf79e3..d99eb3749826f01 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected @@ -99,7 +99,6 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: .cfi_def_cfa 22, 0 ; CHECK-NEXT: st.w $zero, $fp, -12 ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(x) 
-; CHECK-NEXT: addi.w $a0, $a0, %pc_lo12(x) ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: st.w $a1, $fp, -16 ; CHECK-NEXT: ori $a2, $zero, 2 @@ -108,7 +107,7 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: st.w $a3, $fp, -24 ; CHECK-NEXT: ori $a4, $zero, 4 ; CHECK-NEXT: st.w $a4, $fp, -28 -; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: st.w $a1, $a0, %pc_lo12(x) ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: st.w $a1, $fp, -16 diff --git a/llvm/tools/bugpoint/CMakeLists.txt b/llvm/tools/bugpoint/CMakeLists.txt index f846aed24b75ebb..b0e71910c7cc38f 100644 --- a/llvm/tools/bugpoint/CMakeLists.txt +++ b/llvm/tools/bugpoint/CMakeLists.txt @@ -37,5 +37,5 @@ add_llvm_tool(bugpoint DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) +export_executable_symbols_for_plugins(bugpoint) diff --git a/llvm/tools/llc/CMakeLists.txt b/llvm/tools/llc/CMakeLists.txt index c5407944dd2138c..01825c6e4c64c77 100644 --- a/llvm/tools/llc/CMakeLists.txt +++ b/llvm/tools/llc/CMakeLists.txt @@ -30,5 +30,6 @@ add_llvm_tool(llc DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) + +export_executable_symbols_for_plugins(llc) diff --git a/llvm/tools/llvm-lto2/CMakeLists.txt b/llvm/tools/llvm-lto2/CMakeLists.txt index 335392fb8990a07..3b4644d6e277150 100644 --- a/llvm/tools/llvm-lto2/CMakeLists.txt +++ b/llvm/tools/llvm-lto2/CMakeLists.txt @@ -21,6 +21,5 @@ add_llvm_tool(llvm-lto2 DEPENDS intrinsics_gen - - EXPORT_SYMBOLS_FOR_PLUGINS ) +export_executable_symbols_for_plugins(llvm-lto2) diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp index a211b6ac8d6cfc3..d2274877f126b21 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp @@ -88,7 +88,8 @@ static void maybeRewriteCallWithDifferentBundles( }); // Finally actually replace the bundles on the call. 
- CallBase *NewCall = CallBase::Create(OrigCall, NewBundles, OrigCall); + CallBase *NewCall = + CallBase::Create(OrigCall, NewBundles, OrigCall->getIterator()); OrigCall->replaceAllUsesWith(NewCall); OrigCall->eraseFromParent(); } diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 80fb21038d304f9..e44b6023fff231e 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -336,7 +336,7 @@ struct LoadModifier: public Modifier { // Try to use predefined pointers. If non-exist, use undef pointer value; Value *Ptr = getRandomPointerValue(); Type *Ty = pickType(); - Value *V = new LoadInst(Ty, Ptr, "L", BB->getTerminator()); + Value *V = new LoadInst(Ty, Ptr, "L", BB->getTerminator()->getIterator()); PT->push_back(V); } }; @@ -356,7 +356,7 @@ struct StoreModifier: public Modifier { return; Value *Val = getRandomValue(ValTy); - new StoreInst(Val, Ptr, BB->getTerminator()); + new StoreInst(Val, Ptr, BB->getTerminator()->getIterator()); } }; @@ -399,7 +399,8 @@ struct BinModifier: public Modifier { case 12:{Op = Instruction::Xor; break; } } - PT->push_back(BinaryOperator::Create(Op, Val0, Val1, "B", Term)); + PT->push_back( + BinaryOperator::Create(Op, Val0, Val1, "B", Term->getIterator())); } }; @@ -462,8 +463,8 @@ struct AllocaModifier: public Modifier { void Act() override { Type *Tp = pickType(); const DataLayout &DL = BB->getDataLayout(); - PT->push_back(new AllocaInst(Tp, DL.getAllocaAddrSpace(), - "A", BB->getFirstNonPHI())); + PT->push_back(new AllocaInst(Tp, DL.getAllocaAddrSpace(), "A", + BB->getFirstNonPHIIt())); } }; @@ -474,9 +475,8 @@ struct ExtractElementModifier: public Modifier { void Act() override { Value *Val0 = getRandomVectorValue(); Value *V = ExtractElementInst::Create( - Val0, - getRandomValue(Type::getInt32Ty(BB->getContext())), - "E", BB->getTerminator()); + Val0, getRandomValue(Type::getInt32Ty(BB->getContext())), "E", + BB->getTerminator()->getIterator()); 
return PT->push_back(V); } }; @@ -508,7 +508,7 @@ struct ShuffModifier: public Modifier { Constant *Mask = ConstantVector::get(Idxs); Value *V = new ShuffleVectorInst(Val0, Val1, Mask, "Shuff", - BB->getTerminator()); + BB->getTerminator()->getIterator()); PT->push_back(V); } }; @@ -522,9 +522,8 @@ struct InsertElementModifier: public Modifier { Value *Val1 = getRandomValue(Val0->getType()->getScalarType()); Value *V = InsertElementInst::Create( - Val0, Val1, - getRandomValue(Type::getInt32Ty(BB->getContext())), - "I", BB->getTerminator()); + Val0, Val1, getRandomValue(Type::getInt32Ty(BB->getContext())), "I", + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -550,7 +549,7 @@ struct CastModifier: public Modifier { if (!DestTy->isPointerTy()) DestTy = PointerType::get(DestTy, 0); return PT->push_back( - new BitCastInst(V, DestTy, "PC", BB->getTerminator())); + new BitCastInst(V, DestTy, "PC", BB->getTerminator()->getIterator())); } unsigned VSize = VTy->getScalarType()->getPrimitiveSizeInBits(); @@ -559,47 +558,50 @@ struct CastModifier: public Modifier { // Generate lots of bitcasts. 
if ((getRandom() & 1) && VSize == DestSize) { return PT->push_back( - new BitCastInst(V, DestTy, "BC", BB->getTerminator())); + new BitCastInst(V, DestTy, "BC", BB->getTerminator()->getIterator())); } // Both types are integers: if (VTy->isIntOrIntVectorTy() && DestTy->isIntOrIntVectorTy()) { if (VSize > DestSize) { return PT->push_back( - new TruncInst(V, DestTy, "Tr", BB->getTerminator())); + new TruncInst(V, DestTy, "Tr", BB->getTerminator()->getIterator())); } else { assert(VSize < DestSize && "Different int types with the same size?"); if (getRandom() & 1) - return PT->push_back( - new ZExtInst(V, DestTy, "ZE", BB->getTerminator())); - return PT->push_back(new SExtInst(V, DestTy, "Se", BB->getTerminator())); + return PT->push_back(new ZExtInst( + V, DestTy, "ZE", BB->getTerminator()->getIterator())); + return PT->push_back( + new SExtInst(V, DestTy, "Se", BB->getTerminator()->getIterator())); } } // Fp to int. if (VTy->isFPOrFPVectorTy() && DestTy->isIntOrIntVectorTy()) { if (getRandom() & 1) - return PT->push_back( - new FPToSIInst(V, DestTy, "FC", BB->getTerminator())); - return PT->push_back(new FPToUIInst(V, DestTy, "FC", BB->getTerminator())); + return PT->push_back(new FPToSIInst( + V, DestTy, "FC", BB->getTerminator()->getIterator())); + return PT->push_back( + new FPToUIInst(V, DestTy, "FC", BB->getTerminator()->getIterator())); } // Int to fp. if (VTy->isIntOrIntVectorTy() && DestTy->isFPOrFPVectorTy()) { if (getRandom() & 1) - return PT->push_back( - new SIToFPInst(V, DestTy, "FC", BB->getTerminator())); - return PT->push_back(new UIToFPInst(V, DestTy, "FC", BB->getTerminator())); + return PT->push_back(new SIToFPInst( + V, DestTy, "FC", BB->getTerminator()->getIterator())); + return PT->push_back( + new UIToFPInst(V, DestTy, "FC", BB->getTerminator()->getIterator())); } // Both floats. 
if (VTy->isFPOrFPVectorTy() && DestTy->isFPOrFPVectorTy()) { if (VSize > DestSize) { - return PT->push_back( - new FPTruncInst(V, DestTy, "Tr", BB->getTerminator())); + return PT->push_back(new FPTruncInst( + V, DestTy, "Tr", BB->getTerminator()->getIterator())); } else if (VSize < DestSize) { return PT->push_back( - new FPExtInst(V, DestTy, "ZE", BB->getTerminator())); + new FPExtInst(V, DestTy, "ZE", BB->getTerminator()->getIterator())); } // If VSize == DestSize, then the two types must be fp128 and ppc_fp128, // for which there is no defined conversion. So do nothing. @@ -625,7 +627,8 @@ struct SelectModifier: public Modifier { CondTy = VectorType::get(CondTy, VTy->getElementCount()); Value *Cond = getRandomValue(CondTy); - Value *V = SelectInst::Create(Cond, Val0, Val1, "Sl", BB->getTerminator()); + Value *V = SelectInst::Create(Cond, Val0, Val1, "Sl", + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -654,7 +657,7 @@ struct CmpModifier: public Modifier { Value *V = CmpInst::Create(fp ? 
Instruction::FCmp : Instruction::ICmp, (CmpInst::Predicate)op, Val0, Val1, "Cmp", - BB->getTerminator()); + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -712,7 +715,8 @@ static void IntroduceControlFlow(Function *F, Random &R) { BasicBlock *Next = Curr->splitBasicBlock(Loc, "CF"); Instr->moveBefore(Curr->getTerminator()); if (Curr != &F->getEntryBlock()) { - BranchInst::Create(Curr, Next, Instr, Curr->getTerminator()); + BranchInst::Create(Curr, Next, Instr, + Curr->getTerminator()->getIterator()); Curr->getTerminator()->eraseFromParent(); } } diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index c235fcf1ac9605b..8d031b2cc57c786 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -45,7 +45,8 @@ add_llvm_tool(opt DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) target_link_libraries(opt PRIVATE LLVMOptDriver) + +export_executable_symbols_for_plugins(opt) diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index 9fd782d1b7614b1..5c71bc8063d6c9b 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -1205,7 +1205,7 @@ TEST_F(CGSCCPassManagerTest, TestAnalysisInvalidationCGSCCUpdate) { // Insert a bitcast of `h3` so that we retain a ref edge to it. (void)CastInst::CreatePointerCast( &H3F, PointerType::getUnqual(H2F.getContext()), "dummy", - &*H2F.begin()->begin()); + H2F.begin()->begin()); // Now update the call graph. auto &NewC = @@ -1251,7 +1251,7 @@ TEST_F(CGSCCPassManagerTest, TestAnalysisInvalidationCGSCCUpdate) { assert(H3F.getName() == "h3" && "Wrong called function!"); H2F.begin()->begin()->eraseFromParent(); // And insert a call to `h3`. - (void)CallInst::Create(&H3F, {}, "", &*H2F.begin()->begin()); + (void)CallInst::Create(&H3F, {}, "", H2F.begin()->begin()); // Now update the call graph. 
auto &NewC = @@ -1359,7 +1359,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses0) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. - Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1396,7 +1396,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses1) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. - Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1429,7 +1429,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses2) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); auto &FN = *llvm::find_if( @@ -1460,7 +1460,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses3) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); auto &FN = *llvm::find_if( @@ -1492,7 +1492,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses4) { ReturnInst::Create(FnewF->getContext(), BB); // And insert a call to `newF` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnewF, {}, "", IP); // Use the CallGraphUpdater to update the call graph for the new @@ -1536,7 +1536,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses5) { CGU.initialize(CG, C, AM, UR); // And 
insert a call to `newF` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnewF, {}, "", IP); auto &FN = *llvm::find_if( @@ -1569,7 +1569,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses6) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. - Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1600,7 +1600,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses7) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); // Use the CallGraphUpdater to update the call graph for the new @@ -1690,7 +1690,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses10) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, and `h3`. - Instruction *IP = &FnH1->getEntryBlock().front(); + BasicBlock::iterator IP = FnH1->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1763,11 +1763,11 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions1) { // 2. Insert a ref edge from 'f' to 'f'. (void)CastInst::CreatePointerCast( &F, PointerType::getUnqual(F.getContext()), "f.ref", - &F.getEntryBlock().front()); + F.getEntryBlock().begin()); // 3. Insert a ref edge from 'f' to 'g'. (void)CastInst::CreatePointerCast( G, PointerType::getUnqual(F.getContext()), "g.ref", - &F.getEntryBlock().front()); + F.getEntryBlock().begin()); CG.addSplitFunction(F, *G); @@ -1827,9 +1827,9 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions2) { (void)ReturnInst::Create(G2->getContext(), G2BB); // Add 'f -> g1' call edge. 
- (void)CallInst::Create(G1, {}, "", &F.getEntryBlock().front()); + (void)CallInst::Create(G1, {}, "", F.getEntryBlock().begin()); // Add 'f -> g2' call edge. - (void)CallInst::Create(G2, {}, "", &F.getEntryBlock().front()); + (void)CallInst::Create(G2, {}, "", F.getEntryBlock().begin()); CG.addSplitFunction(F, *G1); CG.addSplitFunction(F, *G2); @@ -1853,11 +1853,11 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions2) { // Add 'f -> h1' ref edge. (void)CastInst::CreatePointerCast(H1, PointerType::getUnqual(F.getContext()), - "h1.ref", &F.getEntryBlock().front()); + "h1.ref", F.getEntryBlock().begin()); // Add 'f -> h2' ref edge. (void)CastInst::CreatePointerCast(H2, PointerType::getUnqual(F.getContext()), - "h2.ref", &F.getEntryBlock().front()); + "h2.ref", F.getEntryBlock().begin()); CG.addSplitRefRecursiveFunctions(F, SmallVector({H1, H2})); @@ -1980,7 +1980,8 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewNonTrivialCallEdge) { ASSERT_TRUE(F3 != nullptr); // Create call from f1 to f3. - (void)CallInst::Create(F3, {}, "", F.getEntryBlock().getTerminator()); + (void)CallInst::Create(F3, {}, "", + F.getEntryBlock().getTerminator()->getIterator()); ASSERT_NO_FATAL_FAILURE( updateCGAndAnalysisManagerForCGSCCPass(CG, C, *N, AM, UR, FAM)) diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt index a1199adba076a7d..3cba630867a83b3 100644 --- a/llvm/unittests/Analysis/CMakeLists.txt +++ b/llvm/unittests/Analysis/CMakeLists.txt @@ -62,14 +62,8 @@ else() LIST(APPEND LLVM_OPTIONAL_SOURCES ${MLGO_TESTS}) endif() -# Export symbols from the plugins shared objects. 
-if(NOT WIN32) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) -endif() - add_llvm_unittest_with_input_files(AnalysisTests ${ANALYSIS_TEST_SOURCES} - ${export_symbols} ) add_dependencies(AnalysisTests intrinsics_gen) @@ -82,5 +76,10 @@ if(CMAKE_SYSTEM_NAME STREQUAL "AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-brtl") endif() +# Export symbols from the plugins shared objects. +if(NOT WIN32) + export_executable_symbols_for_plugins(AnalysisTests) +endif() + add_subdirectory(InlineAdvisorPlugin) add_subdirectory(InlineOrderPlugin) diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp index 64b6ccddc53b0c6..6cfc01ed81102fb 100644 --- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp +++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp @@ -2357,7 +2357,7 @@ TEST(LazyCallGraphTest, AddSplitFunction1) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2398,7 +2398,7 @@ TEST(LazyCallGraphTest, AddSplitFunction2) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2441,7 +2441,7 @@ TEST(LazyCallGraphTest, AddSplitFunction3) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2487,7 +2487,7 @@ TEST(LazyCallGraphTest, AddSplitFunction4) { // Create f -ref-> g. 
(void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2533,7 +2533,7 @@ TEST(LazyCallGraphTest, AddSplitFunction5) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2577,7 +2577,7 @@ TEST(LazyCallGraphTest, AddSplitFunction6) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2628,7 +2628,7 @@ TEST(LazyCallGraphTest, AddSplitFunction7) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2681,7 +2681,7 @@ TEST(LazyCallGraphTest, AddSplitFunction8) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2734,7 +2734,7 @@ TEST(LazyCallGraphTest, AddSplitFunction9) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2778,7 +2778,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions1) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2822,7 +2822,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions2) { // Create f -ref-> g. 
(void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2875,9 +2875,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions3) { // Create f -ref-> g1 and f -ref-> g2. (void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2934,9 +2934,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions4) { // Create f -ref-> g1 and f -ref-> g2. (void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -3004,9 +3004,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions5) { // Create f -ref-> g1 and f -ref-> g2. 
(void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index 64a7503d30eedd0..6fc24f6796310d5 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -137,7 +137,7 @@ TEST_F(ScalarEvolutionsTest, SimplifiedPHI) { LoopBB); ReturnInst::Create(Context, nullptr, ExitBB); auto *Ty = Type::getInt32Ty(Context); - auto *PN = PHINode::Create(Ty, 2, "", &*LoopBB->begin()); + auto *PN = PHINode::Create(Ty, 2, "", LoopBB->begin()); PN->addIncoming(Constant::getNullValue(Ty), EntryBB); PN->addIncoming(UndefValue::get(Ty), LoopBB); ScalarEvolution SE = buildSE(*F); @@ -930,10 +930,12 @@ TEST_F(ScalarEvolutionsTest, SCEVAddRecFromPHIwithLargeConstants) { auto *Int64_32 = ConstantInt::get(Context, APInt(64, 32)); auto *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - auto *Phi = PHINode::Create(Type::getInt64Ty(Context), 2, "", Br); - auto *Shl = BinaryOperator::CreateShl(Phi, Int64_32, "", Br); - auto *AShr = BinaryOperator::CreateExactAShr(Shl, Int64_32, "", Br); - auto *Add = BinaryOperator::CreateAdd(AShr, MinInt64, "", Br); + auto *Phi = + PHINode::Create(Type::getInt64Ty(Context), 2, "", Br->getIterator()); + auto *Shl = BinaryOperator::CreateShl(Phi, Int64_32, "", Br->getIterator()); + auto *AShr = + BinaryOperator::CreateExactAShr(Shl, Int64_32, "", Br->getIterator()); + auto *Add = BinaryOperator::CreateAdd(AShr, MinInt64, "", Br->getIterator()); Phi->addIncoming(MinInt64, EntryBB); Phi->addIncoming(Add, LoopBB); // exit: @@ -986,10 +988,11 @@ TEST_F(ScalarEvolutionsTest, 
SCEVAddRecFromPHIwithLargeConstantAccum) { auto *Int32_16 = ConstantInt::get(Context, APInt(32, 16)); auto *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - auto *Phi = PHINode::Create(Int32Ty, 2, "", Br); - auto *Shl = BinaryOperator::CreateShl(Phi, Int32_16, "", Br); - auto *AShr = BinaryOperator::CreateExactAShr(Shl, Int32_16, "", Br); - auto *Add = BinaryOperator::CreateAdd(AShr, MinInt32, "", Br); + auto *Phi = PHINode::Create(Int32Ty, 2, "", Br->getIterator()); + auto *Shl = BinaryOperator::CreateShl(Phi, Int32_16, "", Br->getIterator()); + auto *AShr = + BinaryOperator::CreateExactAShr(Shl, Int32_16, "", Br->getIterator()); + auto *Add = BinaryOperator::CreateAdd(AShr, MinInt32, "", Br->getIterator()); auto *Arg = &*(F->arg_begin()); Phi->addIncoming(Arg, EntryBB); Phi->addIncoming(Add, LoopBB); diff --git a/llvm/unittests/FuzzMutate/OperationsTest.cpp b/llvm/unittests/FuzzMutate/OperationsTest.cpp index be4c75423a89a53..bc972ad21d049f1 100644 --- a/llvm/unittests/FuzzMutate/OperationsTest.cpp +++ b/llvm/unittests/FuzzMutate/OperationsTest.cpp @@ -261,7 +261,7 @@ TEST(OperationsTest, SplitBlock) { // Create a block with only a return and split it on the return. auto *BB = BasicBlock::Create(Ctx, "BB", F); auto *RI = ReturnInst::Create(Ctx, BB); - SBOp.BuilderFunc({UndefValue::get(Type::getInt1Ty(Ctx))}, RI); + SBOp.BuilderFunc({UndefValue::get(Type::getInt1Ty(Ctx))}, RI->getIterator()); // We should end up with an unconditional branch from BB to BB1, and the // return ends up in BB1. @@ -271,9 +271,9 @@ TEST(OperationsTest, SplitBlock) { ASSERT_THAT(RI->getParent(), Eq(BB1)); // Now add an instruction to BB1 and split on that. 
- auto *AI = new AllocaInst(Type::getInt8Ty(Ctx), 0, "a", RI); + auto *AI = new AllocaInst(Type::getInt8Ty(Ctx), 0, "a", RI->getIterator()); Value *Cond = ConstantInt::getFalse(Ctx); - SBOp.BuilderFunc({Cond}, AI); + SBOp.BuilderFunc({Cond}, AI->getIterator()); // We should end up with a loop back on BB1 and the instruction we split on // moves to BB2. @@ -313,7 +313,7 @@ TEST(OperationsTest, SplitEHBlock) { fuzzerop::OpDescriptor Descr = fuzzerop::splitBlockDescriptor(1); - Descr.BuilderFunc({ConstantInt::getTrue(Ctx)}, &*BB.getFirstInsertionPt()); + Descr.BuilderFunc({ConstantInt::getTrue(Ctx)}, BB.getFirstInsertionPt()); ASSERT_TRUE(!verifyModule(*M, &errs())); } @@ -346,7 +346,7 @@ TEST(OperationsTest, SplitBlockWithPhis) { // Now we split the block with PHI nodes, making sure they're all updated. Value *Cond = ConstantInt::getFalse(Ctx); - SBOp.BuilderFunc({Cond}, RI); + SBOp.BuilderFunc({Cond}, RI->getIterator()); // Make sure the PHIs are updated with a value for the third incoming edge. EXPECT_THAT(PHI1->getNumIncomingValues(), Eq(3u)); @@ -373,7 +373,7 @@ TEST(OperationsTest, GEP) { ConstantInt::get(Int32Ty, 0))); GEPOp.BuilderFunc({UndefValue::get(Int8PtrTy), ConstantInt::get(Int32Ty, 0)}, - RI); + RI->getIterator()); EXPECT_FALSE(verifyModule(M, &errs())); } diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index 3756f227143a508..eea2746a352aa62 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -49,12 +49,15 @@ TEST(BasicBlockTest, PhiRange) { // Now insert some PHI nodes. 
auto *Int32Ty = Type::getInt32Ty(Context); - auto *P1 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.1", BI); - auto *P2 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.2", BI); - auto *P3 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.3", BI); + auto *P1 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.1", + BI->getIterator()); + auto *P2 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.2", + BI->getIterator()); + auto *P3 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.3", + BI->getIterator()); // Some non-PHI nodes. - auto *Sum = BinaryOperator::CreateAdd(P1, P2, "sum", BI); + auto *Sum = BinaryOperator::CreateAdd(P1, P2, "sum", BI->getIterator()); // Now wire up the incoming values that are interesting. P1->addIncoming(P2, BB.get()); diff --git a/llvm/unittests/Passes/Plugins/CMakeLists.txt b/llvm/unittests/Passes/Plugins/CMakeLists.txt index 55d7e715014f4b7..e90cae167bc2223 100644 --- a/llvm/unittests/Passes/Plugins/CMakeLists.txt +++ b/llvm/unittests/Passes/Plugins/CMakeLists.txt @@ -6,9 +6,8 @@ if (NOT WIN32 AND NOT CYGWIN) set(LLVM_LINK_COMPONENTS Support Passes Core AsmParser) add_llvm_unittest(PluginsTests PluginsTest.cpp - - EXPORT_SYMBOLS_FOR_PLUGINS ) + export_executable_symbols_for_plugins(PluginsTests) target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport) unset(LLVM_LINK_COMPONENTS) diff --git a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp index c4560cb9ce9b22a..32870d9fb373782 100644 --- a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp +++ b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp @@ -99,22 +99,24 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) { const DataLayout &DL = F->getDataLayout(); BranchInst *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - AllocaInst *Alloca = - new AllocaInst(I32Ty, 
DL.getAllocaAddrSpace(), "alloca", Br); + AllocaInst *Alloca = new AllocaInst(I32Ty, DL.getAllocaAddrSpace(), "alloca", + Br->getIterator()); ConstantInt *Ci32 = ConstantInt::get(Context, APInt(32, 1)); GetElementPtrInst *Gep0 = - GetElementPtrInst::Create(I32Ty, Alloca, Ci32, "gep0", Br); - CastInst *CastA = - CastInst::CreateBitOrPointerCast(Gep0, I8PtrTy, "bitcast1", Br); + GetElementPtrInst::Create(I32Ty, Alloca, Ci32, "gep0", Br->getIterator()); + CastInst *CastA = CastInst::CreateBitOrPointerCast(Gep0, I8PtrTy, "bitcast1", + Br->getIterator()); GetElementPtrInst *Gep1 = - GetElementPtrInst::Create(I8Ty, CastA, Ci32, "gep1", Br); + GetElementPtrInst::Create(I8Ty, CastA, Ci32, "gep1", Br->getIterator()); GetElementPtrInst *Gep2 = GetElementPtrInst::Create( - I8Ty, UndefValue::get(I8PtrTy), Ci32, "gep2", Br); + I8Ty, UndefValue::get(I8PtrTy), Ci32, "gep2", Br->getIterator()); CmpInst *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, - UndefValue::get(I8PtrTy), CastA, "cmp", Br); - SelectInst *Sel = SelectInst::Create(Cmp, Gep1, Gep2, "select", Br); - CastInst *CastB = - CastInst::CreateBitOrPointerCast(Sel, I32PtrTy, "bitcast2", Br); + UndefValue::get(I8PtrTy), CastA, "cmp", + Br->getIterator()); + SelectInst *Sel = + SelectInst::Create(Cmp, Gep1, Gep2, "select", Br->getIterator()); + CastInst *CastB = CastInst::CreateBitOrPointerCast(Sel, I32PtrTy, "bitcast2", + Br->getIterator()); ScalarEvolution SE = buildSE(*F); const SCEV *S = SE.getSCEV(CastB); diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index 910488a14b9858c..ae5ce32d617b476 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -36,9 +36,9 @@ static void printExtensionTable(raw_ostream &OS, if (R->getValueAsBit("Experimental") != Experimental) continue; - OS << " {\"" << getExtensionName(R) << "\", {" - << R->getValueAsInt("MajorVersion") << ", " - << 
R->getValueAsInt("MinorVersion") << "}},\n"; + OS.indent(4) << "{\"" << getExtensionName(R) << "\", {" + << R->getValueAsInt("MajorVersion") << ", " + << R->getValueAsInt("MinorVersion") << "}},\n"; } OS << "};\n\n"; @@ -77,8 +77,8 @@ static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) { if (!ImpliedExt->isSubClassOf("RISCVExtension")) continue; - OS << " { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt) - << "\"},\n"; + OS.indent(4) << "{ {\"" << Name << "\"}, \"" + << getExtensionName(ImpliedExt) << "\"},\n"; } } @@ -236,10 +236,10 @@ static void emitRISCVExtensionBitmask(RecordKeeper &RK, raw_ostream &OS) { "duplicated bitmask"); #endif - OS << " {" - << "\"" << ExtName << "\"" - << ", " << GroupIDVal << ", " << BitPosVal << "ULL" - << "},\n"; + OS.indent(4) << "{" + << "\"" << ExtName << "\"" + << ", " << GroupIDVal << ", " << BitPosVal << "ULL" + << "},\n"; } OS << "};\n"; OS << "#endif\n"; diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index f150098eaaeef48..f05d8b89e73b933 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -52,6 +52,7 @@ class string: ASM_FUNCTION_AMDGPU_RE = re.compile( r"\.type\s+_?(?P[^,\n]+),@function\n" + r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" 
r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P.*?)\n" # (body of the function) # This list is incomplete diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn index 248e47a9584cdf2..09a5311c122fc54 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn @@ -42,6 +42,7 @@ static_library("LLVMLoongArchCodeGen") { "LoongArchISelLowering.cpp", "LoongArchInstrInfo.cpp", "LoongArchMCInstLower.cpp", + "LoongArchMergeBaseOffset.cpp", "LoongArchOptWInstrs.cpp", "LoongArchRegisterInfo.cpp", "LoongArchSubtarget.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn index 0f67db3549fb140..aeaeb576513576b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn @@ -32,6 +32,7 @@ static_library("Instrumentation") { "PGOInstrumentation.cpp", "PGOMemOPSizeOpt.cpp", "PoisonChecking.cpp", + "RealtimeSanitizer.cpp", "SanitizerBinaryMetadata.cpp", "SanitizerCoverage.cpp", "ThreadSanitizer.cpp", diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index e228229302cff46..d79b90f840ce836 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -21,6 +21,7 @@ namespace mlir { class DataLayoutAnalysis; +class FunctionOpInterface; class LowerToLLVMOptions; namespace LLVM { @@ -50,13 +51,25 @@ class LLVMTypeConverter : public TypeConverter { LLVMTypeConverter(MLIRContext *ctx, const LowerToLLVMOptions &options, const DataLayoutAnalysis *analysis = nullptr); - /// Convert a function type. The arguments and results are converted one by + /// Convert a function type. 
The arguments and results are converted one by /// one and results are packed into a wrapped LLVM IR structure type. `result` /// is populated with argument mapping. Type convertFunctionSignature(FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, SignatureConversion &result) const; + /// Convert a function type. The arguments and results are converted one by + /// one and results are packed into a wrapped LLVM IR structure type. `result` + /// is populated with argument mapping. Converted types of `llvm.byval` and + /// `llvm.byref` function arguments which are not LLVM pointers are overridden + /// with LLVM pointers. Overridden arguments are returned in + /// `byValRefNonPtrAttrs`. + Type convertFunctionSignature(FunctionOpInterface funcOp, bool isVariadic, + bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> + &byValRefNonPtrAttrs) const; + /// Convert a non-empty list of types to be returned from a function into an /// LLVM-compatible type. In particular, if more than one value is returned, /// create an LLVM dialect structure type with elements that correspond to @@ -159,12 +172,26 @@ class LLVMTypeConverter : public TypeConverter { SmallVector &getCurrentThreadRecursiveStack(); private: - /// Convert a function type. The arguments and results are converted one by - /// one. Additionally, if the function returns more than one value, pack the + /// Convert a function type. The arguments and results are converted one by + /// one. Additionally, if the function returns more than one value, pack the /// results into an LLVM IR structure type so that the converted function type /// returns at most one result. Type convertFunctionType(FunctionType type) const; + /// Common implementation for `convertFunctionSignature` methods. Convert a + /// function type. The arguments and results are converted one by one and + /// results are packed into a wrapped LLVM IR structure type. 
`result` is + /// populated with argument mapping. If `byValRefNonPtrAttrs` is provided, + /// converted types of `llvm.byval` and `llvm.byref` function arguments which + /// are not LLVM pointers are overridden with LLVM pointers. `llvm.byval` and + /// `llvm.byref` arguments that were already converted to LLVM pointer types + /// are removed from 'byValRefNonPtrAttrs`. + Type convertFunctionSignatureImpl( + FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> *byValRefNonPtrAttrs) + const; + /// Convert the index type. Uses llvmModule data layout to create an integer /// of the pointer bitwidth. Type convertIndexType(IndexType type) const; diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index acbcbae105dbfb1..847040466a85fd3 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -80,8 +80,8 @@ def ExecuteRegionOp : SCF_Op<"execute_region", [ DeclareOpInterfaceMethods]> { let summary = "operation that executes its region exactly once"; let description = [{ - The `execute_region` operation is used to allow multiple blocks within SCF - and other operations which can hold only one block. The `execute_region` + The `scf.execute_region` operation is used to allow multiple blocks within SCF + and other operations which can hold only one block. The `scf.execute_region` operation executes the region held exactly once and cannot have any operands. As such, its region has no arguments. All SSA values that dominate the op can be accessed inside the op. The op's region can have multiple blocks and the @@ -344,7 +344,7 @@ def ForallOp : SCF_Op<"forall", [ The only allowed terminator is `scf.forall.in_parallel`. `scf.forall` returns one value per `shared_out` operand. 
The - actions of the `in_parallel` terminators specify how to combine the + actions of the `scf.forall.in_parallel` terminators specify how to combine the partial results of all parallel invocations into a full value, in some unspecified order. The "destination" of each such op must be a `shared_out` block argument of the `scf.forall` op. @@ -633,7 +633,7 @@ def InParallelOp : SCF_Op<"forall.in_parallel", [ ] # GraphRegionNoTerminator.traits> { let summary = "terminates a `forall` block"; let description = [{ - `scf.forall.in_parallel` is a designated terminator for + The `scf.forall.in_parallel` is a designated terminator for the `scf.forall` operation. It has a single region with a single block that contains a flat list of ops. @@ -778,7 +778,7 @@ def ParallelOp : SCF_Op<"parallel", HasParallelRegion]> { let summary = "parallel for operation"; let description = [{ - The "scf.parallel" operation represents a loop nest taking 4 groups of SSA + The `scf.parallel` operation represents a loop nest taking 4 groups of SSA values as operands that represent the lower bounds, upper bounds, steps and initial values, respectively. The operation defines a variadic number of SSA values for its induction variables. It has one region capturing the @@ -787,7 +787,7 @@ def ParallelOp : SCF_Op<"parallel", machine word. The steps are values of type index, required to be positive. The lower and upper bounds specify a half-open range: the range includes the lower bound but does not include the upper bound. The initial values - have the same types as results of "scf.parallel". If there are no results, + have the same types as results of `scf.parallel`. If there are no results, the keyword `init` can be omitted. Semantically we require that the iteration space can be iterated in any @@ -796,17 +796,17 @@ def ParallelOp : SCF_Op<"parallel", The parallel loop operation supports reduction of values produced by individual iterations into a single result. 
This is modeled using the - "scf.reduce" terminator operation (see "scf.reduce" for details). The i-th - result of an "scf.parallel" operation is associated with the i-th initial - value operand, the i-th operand of the "scf.reduce" operation (the value to - be reduced) and the i-th region of the "scf.reduce" operation (the reduction + `scf.reduce` terminator operation (see `scf.reduce` for details). The i-th + result of an `scf.parallel` operation is associated with the i-th initial + value operand, the i-th operand of the `scf.reduce` operation (the value to + be reduced) and the i-th region of the `scf.reduce` operation (the reduction function). Consequently, we require that the number of results of an - "scf.parallel" op matches the number of initial values and the the number of - reductions in the "scf.reduce" terminator. + `scf.parallel` op matches the number of initial values and the the number of + reductions in the `scf.reduce` terminator. The body region must contain exactly one block that terminates with a - "scf.reduce" operation. If an "scf.parallel" op has no reductions, the - terminator has no operands and no regions. The "scf.parallel" parser will + `scf.reduce` operation. If an `scf.parallel` op has no reductions, the + terminator has no operands and no regions. The `scf.parallel` parser will automatically insert the terminator for ops that have no reductions if it is absent. @@ -875,25 +875,25 @@ def ReduceOp : SCF_Op<"reduce", [ DeclareOpInterfaceMethods]> { let summary = "reduce operation for scf.parallel"; let description = [{ - "scf.reduce" is the terminator for "scf.parallel" operations. It can model + The `scf.reduce` operation is the terminator for `scf.parallel` operations. It can model an arbitrary number of reductions. It has one region per reduction. Each region has one block with two arguments which have the same type as the - corresponding operand of "scf.reduce". 
The operands of the op are the values + corresponding operand of `scf.reduce`. The operands of the op are the values that should be reduce; one value per reduction. The i-th reduction (i.e., the i-th region and the i-th operand) corresponds - the i-th initial value and the i-th result of the enclosing "scf.parallel" + the i-th initial value and the i-th result of the enclosing `scf.parallel` op. - The "scf.reduce" operation contains regions whose entry blocks expect two + The `scf.reduce` operation contains regions whose entry blocks expect two arguments of the same type as the corresponding operand. As the iteration order of the enclosing parallel loop and hence reduction order is unspecified, the results of the reductions may be non-deterministic unless the reductions are associative and commutative. - The result of a reduction region ("scf.reduce.return" operand) must have the - same type as the corresponding "scf.reduce" operand and the corresponding - "scf.parallel" initial value. + The result of a reduction region (`scf.reduce.return` operand) must have the + same type as the corresponding `scf.reduce` operand and the corresponding + `scf.parallel` initial value. Example: @@ -929,9 +929,9 @@ def ReduceReturnOp : SCF_Op<"reduce.return", [HasParent<"ReduceOp">, Pure, Terminator]> { let summary = "terminator for reduce operation"; let description = [{ - "scf.reduce.return" is a special terminator operation for the block inside - "scf.reduce" regions. It terminates the region. It should have the same - operand type as the corresponding operand of the enclosing "scf.reduce" op. + The `scf.reduce.return` operation is a special terminator operation for the block inside + `scf.reduce` regions. It terminates the region. It should have the same + operand type as the corresponding operand of the enclosing `scf.reduce` op. 
Example: @@ -1172,12 +1172,12 @@ def YieldOp : SCF_Op<"yield", [Pure, ReturnLike, Terminator, "WhileOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ - "scf.yield" yields an SSA value from the SCF dialect op region and + The `scf.yield` operation yields an SSA value from the SCF dialect op region and terminates the regions. The semantics of how the values are yielded is defined by the parent operation. - If "scf.yield" has any operands, the operands must match the parent + If `scf.yield` has any operands, the operands must match the parent operation's results. - If the parent operation defines no values, then the "scf.yield" may be + If the parent operation defines no values, then the `scf.yield` may be left out in the custom syntax and the builders will insert one implicitly. Otherwise, it has to be present in the syntax to indicate which values are yielded. diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp index c1f6d8bc5b361df..4c2e8682285c52f 100644 --- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp +++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp @@ -267,6 +267,38 @@ static void wrapExternalFunction(OpBuilder &builder, Location loc, } } +/// Inserts `llvm.load` ops in the function body to restore the expected pointee +/// value from `llvm.byval`/`llvm.byref` function arguments that were converted +/// to LLVM pointer types. +static void restoreByValRefArgumentType( + ConversionPatternRewriter &rewriter, const LLVMTypeConverter &typeConverter, + ArrayRef> byValRefNonPtrAttrs, + LLVM::LLVMFuncOp funcOp) { + // Nothing to do for function declarations. 
+ if (funcOp.isExternal()) + return; + + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&funcOp.getFunctionBody().front()); + + for (const auto &[arg, byValRefAttr] : + llvm::zip(funcOp.getArguments(), byValRefNonPtrAttrs)) { + // Skip argument if no `llvm.byval` or `llvm.byref` attribute. + if (!byValRefAttr) + continue; + + // Insert load to retrieve the actual argument passed by value/reference. + assert(isa(arg.getType()) && + "Expected LLVM pointer type for argument with " + "`llvm.byval`/`llvm.byref` attribute"); + Type resTy = typeConverter.convertType( + cast(byValRefAttr->getValue()).getValue()); + + auto valueArg = rewriter.create(arg.getLoc(), resTy, arg); + rewriter.replaceAllUsesExcept(arg, valueArg, valueArg); + } +} + FailureOr mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, ConversionPatternRewriter &rewriter, @@ -280,10 +312,14 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, // Convert the original function arguments. They are converted using the // LLVMTypeConverter provided to this legalization pattern. auto varargsAttr = funcOp->getAttrOfType(varargsAttrName); + // Gather `llvm.byval` and `llvm.byref` arguments whose type conversion was + // overridden with an LLVM pointer type for later processing. 
+ SmallVector> byValRefNonPtrAttrs; TypeConverter::SignatureConversion result(funcOp.getNumArguments()); auto llvmType = converter.convertFunctionSignature( - funcTy, varargsAttr && varargsAttr.getValue(), - shouldUseBarePtrCallConv(funcOp, &converter), result); + funcOp, varargsAttr && varargsAttr.getValue(), + shouldUseBarePtrCallConv(funcOp, &converter), result, + byValRefNonPtrAttrs); if (!llvmType) return rewriter.notifyMatchFailure(funcOp, "signature conversion failed"); @@ -398,6 +434,12 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, "region types conversion failed"); } + // Fix the type mismatch between the materialized `llvm.ptr` and the expected + // pointee type in the function body when converting `llvm.byval`/`llvm.byref` + // function arguments. + restoreByValRefArgumentType(rewriter, converter, byValRefNonPtrAttrs, + newFuncOp); + if (!shouldUseBarePtrCallConv(funcOp, &converter)) { if (funcOp->getAttrOfType( LLVM::LLVMDialect::getEmitCWrapperAttrName())) { diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 17be4d91ee05465..5313a64ed47e3ae 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -270,13 +270,42 @@ Type LLVMTypeConverter::convertFunctionType(FunctionType type) const { return LLVM::LLVMPointerType::get(type.getContext()); } +/// Returns the `llvm.byval` or `llvm.byref` attributes that are present in the +/// function arguments. Returns an empty container if none of these attributes +/// are found in any of the arguments. 
+static void +filterByValRefArgAttrs(FunctionOpInterface funcOp, + SmallVectorImpl> &result) { + assert(result.empty() && "Unexpected non-empty output"); + result.resize(funcOp.getNumArguments(), std::nullopt); + bool foundByValByRefAttrs = false; + for (int argIdx : llvm::seq(funcOp.getNumArguments())) { + for (NamedAttribute namedAttr : funcOp.getArgAttrs(argIdx)) { + if ((namedAttr.getName() == LLVM::LLVMDialect::getByValAttrName() || + namedAttr.getName() == LLVM::LLVMDialect::getByRefAttrName())) { + foundByValByRefAttrs = true; + result[argIdx] = namedAttr; + break; + } + } + } + + if (!foundByValByRefAttrs) + result.clear(); +} + // Function types are converted to LLVM Function types by recursively converting -// argument and result types. If MLIR Function has zero results, the LLVM -// Function has one VoidType result. If MLIR Function has more than one result, +// argument and result types. If MLIR Function has zero results, the LLVM +// Function has one VoidType result. If MLIR Function has more than one result, // they are into an LLVM StructType in their order of appearance. -Type LLVMTypeConverter::convertFunctionSignature( +// If `byValRefNonPtrAttrs` is provided, converted types of `llvm.byval` and +// `llvm.byref` function arguments which are not LLVM pointers are overridden +// with LLVM pointers. `llvm.byval` and `llvm.byref` arguments that were already +// converted to LLVM pointer types are removed from `byValRefNonPtrAttrs`. +Type LLVMTypeConverter::convertFunctionSignatureImpl( FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, - LLVMTypeConverter::SignatureConversion &result) const { + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> *byValRefNonPtrAttrs) const { // Select the argument converter depending on the calling convention. useBarePtrCallConv = useBarePtrCallConv || options.useBarePtrCallConv; auto funcArgConverter = useBarePtrCallConv ? 
barePtrFuncArgTypeConverter @@ -286,6 +315,19 @@ Type LLVMTypeConverter::convertFunctionSignature( SmallVector converted; if (failed(funcArgConverter(*this, type, converted))) return {}; + + // Rewrite converted type of `llvm.byval` or `llvm.byref` function + // argument that was not converted to an LLVM pointer type. + if (byValRefNonPtrAttrs != nullptr && !byValRefNonPtrAttrs->empty() && + converted.size() == 1 && (*byValRefNonPtrAttrs)[idx].has_value()) { + // If the argument was already converted to an LLVM pointer type, we stop + // tracking it as it doesn't need more processing. + if (isa(converted[0])) + (*byValRefNonPtrAttrs)[idx] = std::nullopt; + else + converted[0] = LLVM::LLVMPointerType::get(&getContext()); + } + result.addInputs(idx, converted); } @@ -302,6 +344,27 @@ Type LLVMTypeConverter::convertFunctionSignature( isVariadic); } +Type LLVMTypeConverter::convertFunctionSignature( + FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result) const { + return convertFunctionSignatureImpl(funcTy, isVariadic, useBarePtrCallConv, + result, + /*byValRefNonPtrAttrs=*/nullptr); +} + +Type LLVMTypeConverter::convertFunctionSignature( + FunctionOpInterface funcOp, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> &byValRefNonPtrAttrs) const { + // Gather all `llvm.byval` and `llvm.byref` function arguments. Only those + // that were not converted to LLVM pointer types will be returned for further + // processing. + filterByValRefArgAttrs(funcOp, byValRefNonPtrAttrs); + auto funcTy = cast(funcOp.getFunctionType()); + return convertFunctionSignatureImpl(funcTy, isVariadic, useBarePtrCallConv, + result, &byValRefNonPtrAttrs); +} + /// Converts the function type to a C-compatible format, in particular using /// pointers to memref descriptors for arguments. 
std::pair diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index fe58bd35684e4d0..b8734d81311f67d 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2405,7 +2405,10 @@ BroadcastableToResult mlir::vector::isBroadcastableTo( bool srcDimScalableFlag = srcVectorType.getScalableDims()[dimIdx]; bool dstDimScalableFlag = dstVectorType.getScalableDims()[lead + dimIdx]; if ((srcDim == 1 && srcDimScalableFlag && dstDim != 1) || - (srcDimScalableFlag != dstDimScalableFlag)) + // 1 -> [N] is fine, everything else should be rejected when mixing + // fixed-width and scalable dims + (srcDimScalableFlag != dstDimScalableFlag && + (srcDim != 1 || srcDimScalableFlag))) foundMismatchingDims = true; if (foundMismatchingDims) { diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp index 32e7eb27f5e29b3..6c36bbaee85237e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp @@ -125,7 +125,8 @@ class BroadcastOpLowering : public OpRewritePattern { // .. // %x = [%a,%b,%c,%d] VectorType resType = - VectorType::get(dstType.getShape().drop_front(), eltType); + VectorType::get(dstType.getShape().drop_front(), eltType, + dstType.getScalableDims().drop_front()); Value result = rewriter.create( loc, dstType, rewriter.getZeroAttr(dstType)); if (m == 0) { @@ -136,6 +137,10 @@ class BroadcastOpLowering : public OpRewritePattern { result = rewriter.create(loc, bcst, result, d); } else { // Stetch not at start. + if (dstType.getScalableDims()[0]) { + // TODO: For scalable vectors we should emit an scf.for loop. 
+ return failure(); + } for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) { Value ext = rewriter.create(loc, op.getSource(), d); Value bcst = rewriter.create(loc, resType, ext); diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 55c1c6bad9f2a40..7f59a378e035124 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1771,6 +1771,13 @@ struct DropUnitDimsFromTransposeOp final newPerm.push_back(idx - droppedDimsBefore[idx]); } + // Fixup for `newPerm`. The `sourceTypeWithoutUnitDims` could be vector<1xT> + // type when the dimensions are unit dimensions. In this case, the newPerm + // should be [0]. + if (newPerm.empty()) { + newPerm.push_back(0); + } + Location loc = op.getLoc(); // Drop the unit dims via shape_cast. auto dropDimsShapeCast = rewriter.create( @@ -1782,7 +1789,7 @@ struct DropUnitDimsFromTransposeOp final rewriter.replaceOpWithNewOp( op, op.getResultVectorType(), tranposeWithoutUnitDims); - return failure(); + return success(); } }; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index e86da57fb915788..8f9b21b7ee1e5b9 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2352,7 +2352,7 @@ struct OperationConverter { LogicalResult legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, - std::optional>> &inverseMapping); + DenseMap> &inverseMapping); /// Legalize an operation result that was marked as "erased". 
LogicalResult @@ -2454,10 +2454,12 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { LogicalResult OperationConverter::finalize(ConversionPatternRewriter &rewriter) { - std::optional>> inverseMapping; ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); - if (failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl)) || - failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, + if (failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl))) + return failure(); + DenseMap> inverseMapping = + rewriterImpl.mapping.getInverse(); + if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, inverseMapping))) return failure(); @@ -2483,15 +2485,11 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (result.getType() == newValue.getType()) continue; - // Compute the inverse mapping only if it is really needed. - if (!inverseMapping) - inverseMapping = rewriterImpl.mapping.getInverse(); - // Legalize this result. rewriter.setInsertionPoint(op); if (failed(legalizeChangedResultType( op, result, newValue, opReplacement->getConverter(), rewriter, - rewriterImpl, *inverseMapping))) + rewriterImpl, inverseMapping))) return failure(); } } @@ -2503,6 +2501,8 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( ConversionPatternRewriterImpl &rewriterImpl) { // Functor used to check if all users of a value will be dead after // conversion. + // TODO: This should probably query the inverse mapping, same as in + // `legalizeChangedResultType`. 
auto findLiveUser = [&](Value val) { auto liveUserIt = llvm::find_if_not(val.getUsers(), [&](Operation *user) { return rewriterImpl.isOpIgnored(user); @@ -2796,20 +2796,18 @@ static LogicalResult legalizeUnresolvedMaterialization( LogicalResult OperationConverter::legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, - std::optional>> &inverseMapping) { - inverseMapping = rewriterImpl.mapping.getInverse(); - + DenseMap> &inverseMapping) { // As an initial step, compute all of the inserted materializations that we // expect to persist beyond the conversion process. DenseMap materializationOps; SetVector necessaryMaterializations; computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, - *inverseMapping, necessaryMaterializations); + inverseMapping, necessaryMaterializations); // Once computed, legalize any necessary materializations. for (auto *mat : necessaryMaterializations) { if (failed(legalizeUnresolvedMaterialization( - *mat, materializationOps, rewriter, rewriterImpl, *inverseMapping))) + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) return failure(); } return success(); diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index c310954b906e4e5..d164e8750979689 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -23,6 +23,15 @@ func.func @bitcast_f32_to_i32_vector(%input: vector<16xf32>) -> vector<16xi32> { // CHECK-SAME: %[[input:.*]]: vector<16xf32> // CHECK: llvm.bitcast %[[input]] : vector<16xf32> to vector<16xi32> +func.func @bitcast_f32_to_i32_vector_scalable(%input: vector<[16]xf32>) -> vector<[16]xi32> { + %0 = vector.bitcast %input : vector<[16]xf32> to vector<[16]xi32> + return %0 : vector<[16]xi32> +} + +// CHECK-LABEL: @bitcast_f32_to_i32_vector_scalable +// CHECK-SAME: %[[input:.*]]: 
vector<[16]xf32> +// CHECK: llvm.bitcast %[[input]] : vector<[16]xf32> to vector<[16]xi32> + // ----- func.func @bitcast_i8_to_f32_vector(%input: vector<64xi8>) -> vector<16xf32> { @@ -34,6 +43,15 @@ func.func @bitcast_i8_to_f32_vector(%input: vector<64xi8>) -> vector<16xf32> { // CHECK-SAME: %[[input:.*]]: vector<64xi8> // CHECK: llvm.bitcast %[[input]] : vector<64xi8> to vector<16xf32> +func.func @bitcast_i8_to_f32_vector_scalable(%input: vector<[64]xi8>) -> vector<[16]xf32> { + %0 = vector.bitcast %input : vector<[64]xi8> to vector<[16]xf32> + return %0 : vector<[16]xf32> +} + +// CHECK-LABEL: @bitcast_i8_to_f32_vector_scalable +// CHECK-SAME: %[[input:.*]]: vector<[64]xi8> +// CHECK: llvm.bitcast %[[input]] : vector<[64]xi8> to vector<[16]xf32> + // ----- func.func @bitcast_index_to_i8_vector(%input: vector<16xindex>) -> vector<128xi8> { @@ -46,6 +64,16 @@ func.func @bitcast_index_to_i8_vector(%input: vector<16xindex>) -> vector<128xi8 // CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[input]] : vector<16xindex> to vector<16xi64> // CHECK: llvm.bitcast %[[T0]] : vector<16xi64> to vector<128xi8> +func.func @bitcast_index_to_i8_vector_scalable(%input: vector<[16]xindex>) -> vector<[128]xi8> { + %0 = vector.bitcast %input : vector<[16]xindex> to vector<[128]xi8> + return %0 : vector<[128]xi8> +} + +// CHECK-LABEL: @bitcast_index_to_i8_vector_scalable +// CHECK-SAME: %[[input:.*]]: vector<[16]xindex> +// CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[input]] : vector<[16]xindex> to vector<[16]xi64> +// CHECK: llvm.bitcast %[[T0]] : vector<[16]xi64> to vector<[128]xi8> + // ----- func.func @broadcast_vec0d_from_f32(%arg0: f32) -> vector { @@ -80,6 +108,17 @@ func.func @broadcast_vec1d_from_f32(%arg0: f32) -> vector<2xf32> { // CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] // CHECK: return %[[T1]] : vector<2xf32> + +func.func @broadcast_vec1d_from_f32_scalable(%arg0: f32) -> vector<[2]xf32> { + %0 = vector.broadcast %arg0 : f32 to 
vector<[2]xf32> + return %0 : vector<[2]xf32> +} +// CHECK-LABEL: @broadcast_vec1d_from_f32_scalable +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: return %[[T1]] : vector<[2]xf32> + // ----- func.func @broadcast_vec1d_from_index(%arg0: index) -> vector<2xindex> { @@ -94,6 +133,18 @@ func.func @broadcast_vec1d_from_index(%arg0: index) -> vector<2xindex> { // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<2xi64> to vector<2xindex> // CHECK: return %[[T2]] : vector<2xindex> +func.func @broadcast_vec1d_from_index_scalable(%arg0: index) -> vector<[2]xindex> { + %0 = vector.broadcast %arg0 : index to vector<[2]xindex> + return %0 : vector<[2]xindex> +} +// CHECK-LABEL: @broadcast_vec1d_from_index_scalable +// CHECK-SAME: %[[A:.*]]: index) +// CHECK: %[[A1:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A1]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<[2]xi64> to vector<[2]xindex> +// CHECK: return %[[T2]] : vector<[2]xindex> + // ----- func.func @broadcast_vec2d_from_scalar(%arg0: f32) -> vector<2x3xf32> { @@ -109,6 +160,19 @@ func.func @broadcast_vec2d_from_scalar(%arg0: f32) -> vector<2x3xf32> { // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T4]] : vector<2x3xf32> +func.func @broadcast_vec2d_from_scalar_scalable(%arg0: f32) -> vector<2x[3]xf32> { + %0 = vector.broadcast %arg0 : f32 to vector<2x[3]xf32> + return %0 : vector<2x[3]xf32> +} +// CHECK-LABEL: @broadcast_vec2d_from_scalar_scalable( +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[0] : !llvm.array<2 x vector<[3]xf32>> 
+// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T4]] : vector<2x[3]xf32> + // ----- func.func @broadcast_vec3d_from_scalar(%arg0: f32) -> vector<2x3x4xf32> { @@ -125,6 +189,21 @@ func.func @broadcast_vec3d_from_scalar(%arg0: f32) -> vector<2x3x4xf32> { // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x array<3 x vector<4xf32>>> to vector<2x3x4xf32> // CHECK: return %[[T4]] : vector<2x3x4xf32> + +func.func @broadcast_vec3d_from_scalar_scalable(%arg0: f32) -> vector<2x3x[4]xf32> { + %0 = vector.broadcast %arg0 : f32 to vector<2x3x[4]xf32> + return %0 : vector<2x3x[4]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_scalar_scalable( +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[0, 0] : !llvm.array<2 x array<3 x vector<[4]xf32>>> +// ... 
+// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[1, 2] : !llvm.array<2 x array<3 x vector<[4]xf32>>> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x array<3 x vector<[4]xf32>>> to vector<2x3x[4]xf32> +// CHECK: return %[[T4]] : vector<2x3x[4]xf32> + // ----- func.func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> { @@ -135,6 +214,14 @@ func.func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> { // CHECK-SAME: %[[A:.*]]: vector<2xf32>) // CHECK: return %[[A]] : vector<2xf32> +func.func @broadcast_vec1d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to vector<[2]xf32> + return %0 : vector<[2]xf32> +} +// CHECK-LABEL: @broadcast_vec1d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK: return %[[A]] : vector<[2]xf32> + // ----- func.func @broadcast_vec2d_from_vec0d(%arg0: vector) -> vector<3x2xf32> { @@ -172,6 +259,20 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> { // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : !llvm.array<3 x vector<2xf32>> to vector<3x2xf32> // CHECK: return %[[T5]] : vector<3x2xf32> +func.func @broadcast_vec2d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to vector<3x[2]xf32> + return %0 : vector<3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec2d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[A]], %[[T1]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.insertvalue %[[A]], %[[T3]][2] : 
!llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : !llvm.array<3 x vector<[2]xf32>> to vector<3x[2]xf32> +// CHECK: return %[[T5]] : vector<3x[2]xf32> + // ----- func.func @broadcast_vec2d_from_index_vec1d(%arg0: vector<2xindex>) -> vector<3x2xindex> { @@ -188,6 +289,20 @@ func.func @broadcast_vec2d_from_index_vec1d(%arg0: vector<2xindex>) -> vector<3x // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %{{.*}} : !llvm.array<3 x vector<2xi64>> to vector<3x2xindex> // CHECK: return %[[T4]] : vector<3x2xindex> +func.func @broadcast_vec2d_from_index_vec1d_scalable(%arg0: vector<[2]xindex>) -> vector<3x[2]xindex> { + %0 = vector.broadcast %arg0 : vector<[2]xindex> to vector<3x[2]xindex> + return %0 : vector<3x[2]xindex> +} +// CHECK-LABEL: @broadcast_vec2d_from_index_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xindex>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<[2]xindex> to vector<[2]xi64> +// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<3x[2]xindex> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xindex> to !llvm.array<3 x vector<[2]xi64>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<3 x vector<[2]xi64>> + +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %{{.*}} : !llvm.array<3 x vector<[2]xi64>> to vector<3x[2]xindex> +// CHECK: return %[[T4]] : vector<3x[2]xindex> + // ----- func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> { @@ -213,6 +328,29 @@ func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> // CHECK: %[[T11:.*]] = builtin.unrealized_conversion_cast %[[T10]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T11]] : vector<4x3x2xf32> +func.func @broadcast_vec3d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to 
vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK-DAG: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK-DAG: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK-DAG: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK-DAG: %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> + +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.insertvalue %[[A]], %[[T3]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[A]], %[[T4]][2] : !llvm.array<3 x vector<[2]xf32>> + +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T5]], %[[T6]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T5]], %[[T7]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T5]], %[[T8]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T10:.*]] = llvm.insertvalue %[[T5]], %[[T9]][3] : !llvm.array<4 x array<3 x vector<[2]xf32>>> + +// CHECK: %[[T11:.*]] = builtin.unrealized_conversion_cast %[[T10]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T11]] : vector<4x3x[2]xf32> + // ----- func.func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf32> { @@ -231,6 +369,22 @@ func.func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf3 // CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T9]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T10]] : vector<4x3x2xf32> +func.func @broadcast_vec3d_from_vec2d_scalable(%arg0: vector<3x[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast 
%arg0 : vector<3x[2]xf32> to vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_vec2d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<3x[2]xf32>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[T1]], %[[T3]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T1]], %[[T5]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T1]], %[[T7]][3] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T9]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T10]] : vector<4x3x[2]xf32> + // ----- @@ -246,6 +400,18 @@ func.func @broadcast_stretch(%arg0: vector<1xf32>) -> vector<4xf32> { // CHECK: %[[T4:.*]] = llvm.shufflevector %[[T3]] // CHECK: return %[[T4]] : vector<4xf32> +func.func @broadcast_stretch_scalable(%arg0: vector<1xf32>) -> vector<[4]xf32> { + %0 = vector.broadcast %arg0 : vector<1xf32> to vector<[4]xf32> + return %0 : vector<[4]xf32> +} +// CHECK-LABEL: @broadcast_stretch_scalable( +// CHECK-SAME: %[[A:.*]]: vector<1xf32>) +// CHECK: %[[T1:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T2:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T1]] : i64] : vector<1xf32> +// CHECK: %[[T3:.*]] = llvm.insertelement %[[T2]] +// CHECK: %[[T4:.*]] = llvm.shufflevector %[[T3]] +// CHECK: return %[[T4]] : vector<[4]xf32> + // ----- func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> { @@ -264,6 
+430,22 @@ func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> // CHECK: %[[T8:.*]] = builtin.unrealized_conversion_cast %[[T7]] : !llvm.array<3 x vector<4xf32>> to vector<3x4xf32> // CHECK: return %[[T8]] : vector<3x4xf32> +func.func @broadcast_stretch_at_start_scalable(%arg0: vector<1x[4]xf32>) -> vector<3x[4]xf32> { + %0 = vector.broadcast %arg0 : vector<1x[4]xf32> to vector<3x[4]xf32> + return %0 : vector<3x[4]xf32> +} +// CHECK-LABEL: @broadcast_stretch_at_start_scalable( +// CHECK-SAME: %[[A:.*]]: vector<1x[4]xf32>) +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<1x[4]xf32> to !llvm.array<1 x vector<[4]xf32>> +// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<3x[4]xf32> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<3x[4]xf32> to !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T3:.*]] = llvm.extractvalue %[[T2]][0] : !llvm.array<1 x vector<[4]xf32>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[T3]], %[[T4]][0] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T6:.*]] = llvm.insertvalue %[[T3]], %[[T5]][1] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T3]], %[[T6]][2] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T8:.*]] = builtin.unrealized_conversion_cast %[[T7]] : !llvm.array<3 x vector<[4]xf32>> to vector<3x[4]xf32> +// CHECK: return %[[T8]] : vector<3x[4]xf32> + // ----- func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { @@ -302,6 +484,16 @@ func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { // CHECK: %[[T27:.*]] = builtin.unrealized_conversion_cast %[[T26]] : !llvm.array<4 x vector<3xf32>> to vector<4x3xf32> // CHECK: return %[[T27]] : vector<4x3xf32> +// TODO: Add support for scalable vectors + +func.func @broadcast_stretch_at_end_scalable(%arg0: vector<[4]x1xf32>) -> vector<[4]x3xf32> { + %0 = vector.broadcast %arg0 : vector<[4]x1xf32> to vector<[4]x3xf32> + 
return %0 : vector<[4]x3xf32> +} +// CHECK-LABEL: @broadcast_stretch_at_end_scalable +// CHECK-SAME: %[[A:.*]]: vector<[4]x1xf32>) +// CHECK: vector.broadcast %[[A]] : vector<[4]x1xf32> to vector<[4]x3xf32> + // ----- func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2xf32> { @@ -338,6 +530,50 @@ func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2 // CHECK: %[[T32:.*]] = builtin.unrealized_conversion_cast %[[T31]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T32]] : vector<4x3x2xf32> +func.func @broadcast_stretch_in_middle_scalable_v1(%arg0: vector<4x1x[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<4x1x[2]xf32> to vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_stretch_in_middle_scalable_v1( +// CHECK-SAME: %[[A:.*]]: vector<4x1x[2]xf32>) -> vector<4x3x[2]xf32> { +// CHECK: %[[T3:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<4x1x[2]xf32> to !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T9:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T2]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.extractvalue %[[T3]][0, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T6:.*]] = llvm.insertvalue %[[T4]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T4]], %[[T6]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T4]], %[[T7]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T10:.*]] = llvm.insertvalue %[[T8]], %[[T9]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: 
%[[T12:.*]] = llvm.extractvalue %[[T3]][1, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T14:.*]] = llvm.insertvalue %[[T12]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T15:.*]] = llvm.insertvalue %[[T12]], %[[T14]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T16:.*]] = llvm.insertvalue %[[T12]], %[[T15]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T17:.*]] = llvm.insertvalue %[[T16]], %[[T10]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T19:.*]] = llvm.extractvalue %[[T3]][2, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T21:.*]] = llvm.insertvalue %[[T19]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T22:.*]] = llvm.insertvalue %[[T19]], %[[T21]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T23:.*]] = llvm.insertvalue %[[T19]], %[[T22]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T24:.*]] = llvm.insertvalue %[[T23]], %[[T17]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T26:.*]] = llvm.extractvalue %[[T3]][3, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T28:.*]] = llvm.insertvalue %[[T26]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T29:.*]] = llvm.insertvalue %[[T26]], %[[T28]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T30:.*]] = llvm.insertvalue %[[T26]], %[[T29]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T31:.*]] = llvm.insertvalue %[[T30]], %[[T24]][3] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T32:.*]] = builtin.unrealized_conversion_cast %[[T31]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T32]] : vector<4x3x[2]xf32> + +// TODO: Add support for scalable vectors + +func.func @broadcast_stretch_in_middle_scalable_v2(%arg0: vector<[4]x1x2xf32>) -> vector<[4]x3x2xf32> { + %0 = vector.broadcast %arg0 : vector<[4]x1x2xf32> to vector<[4]x3x2xf32> + return %0 : vector<[4]x3x2xf32> +} +// CHECK-LABEL: 
@broadcast_stretch_in_middle_scalable_v2( +// CHECK-SAME: %[[A:.*]]: vector<[4]x1x2xf32>) -> vector<[4]x3x2xf32> { +// CHECK: vector.broadcast %[[A]] : vector<[4]x1x2xf32> to vector<[4]x3x2xf32> + // ----- func.func @outerproduct(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<2x3xf32> { @@ -364,6 +600,30 @@ func.func @outerproduct(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<2x // CHECK: %[[T14:.*]] = builtin.unrealized_conversion_cast %[[T13]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T14]] : vector<2x3xf32> +func.func @outerproduct_scalable(%arg0: vector<2xf32>, %arg1: vector<[3]xf32>) -> vector<2x[3]xf32> { + %2 = vector.outerproduct %arg0, %arg1 : vector<2xf32>, vector<[3]xf32> + return %2 : vector<2x[3]xf32> +} +// CHECK-LABEL: @outerproduct_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xf32>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xf32>) +// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<2x[3]xf32> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T2]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T3:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T4:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T3]] : i64] : vector<2xf32> +// CHECK: %[[T5Insert:.*]] = llvm.insertelement %[[T4]] +// CHECK: %[[T5:.*]] = llvm.shufflevector %[[T5Insert]] +// CHECK: %[[T6:.*]] = arith.mulf %[[T5]], %[[B]] : vector<[3]xf32> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T6]], %[[T7]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T9:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[T10:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T9]] : i64] : vector<2xf32> +// CHECK: %[[T11Insert:.*]] = llvm.insertelement %[[T10]] +// CHECK: %[[T11:.*]] = llvm.shufflevector %[[T11Insert]] +// CHECK: %[[T12:.*]] = arith.mulf %[[T11]], %[[B]] : vector<[3]xf32> +// CHECK: %[[T13:.*]] = llvm.insertvalue %[[T12]], %[[T8]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T14:.*]] = 
builtin.unrealized_conversion_cast %[[T13]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T14]] : vector<2x[3]xf32> + // ----- func.func @outerproduct_index(%arg0: vector<2xindex>, %arg1: vector<3xindex>) -> vector<2x3xindex> { @@ -385,6 +645,25 @@ func.func @outerproduct_index(%arg0: vector<2xindex>, %arg1: vector<3xindex>) -> // CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T6]] : vector<3xindex> to vector<3xi64> // CHECK: %{{.*}} = llvm.insertvalue %[[T7]], %[[T8]][0] : !llvm.array<2 x vector<3xi64>> +func.func @outerproduct_index_scalable(%arg0: vector<2xindex>, %arg1: vector<[3]xindex>) -> vector<2x[3]xindex> { + %2 = vector.outerproduct %arg0, %arg1 : vector<2xindex>, vector<[3]xindex> + return %2 : vector<2x[3]xindex> +} +// CHECK-LABEL: @outerproduct_index_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xindex>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xindex>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<2xindex> to vector<2xi64> +// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<2x[3]xindex> +// CHECK: %[[T8:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<2x[3]xindex> to !llvm.array<2 x vector<[3]xi64>> +// CHECK: %[[T2:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T3:.*]] = llvm.extractelement %[[T1]]{{\[}}%[[T2]] : i64] : vector<2xi64> +// CHECK: %[[T4:.*]] = llvm.insertelement %[[T3]] +// CHECK: %[[T5:.*]] = llvm.shufflevector %[[T4]] +// CHECK: %[[T5Cast:.*]] = builtin.unrealized_conversion_cast %[[T5]] : vector<[3]xi64> to vector<[3]xindex> +// CHECK: %[[T6:.*]] = arith.muli %[[T5Cast]], %[[B]] : vector<[3]xindex> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T6]] : vector<[3]xindex> to vector<[3]xi64> +// CHECK: %{{.*}} = llvm.insertvalue %[[T7]], %[[T8]][0] : !llvm.array<2 x vector<[3]xi64>> + // ----- func.func @outerproduct_add(%arg0: vector<2xf32>, %arg1: vector<3xf32>, %arg2: vector<2x3xf32>) -> vector<2x3xf32> { @@ -415,6 +694,34 @@ 
func.func @outerproduct_add(%arg0: vector<2xf32>, %arg1: vector<3xf32>, %arg2: v // CHECK: %[[T19:.*]] = builtin.unrealized_conversion_cast %[[T18]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T19]] : vector<2x3xf32> +func.func @outerproduct_add_scalable(%arg0: vector<2xf32>, %arg1: vector<[3]xf32>, %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> { + %2 = vector.outerproduct %arg0, %arg1, %arg2 : vector<2xf32>, vector<[3]xf32> + return %2 : vector<2x[3]xf32> +} +// CHECK-LABEL: @outerproduct_add_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xf32>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xf32>, +// CHECK-SAME: %[[C:.*]]: vector<2x[3]xf32>) -> vector<2x[3]xf32> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[C]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T3:.*]] = arith.constant dense<0.000000e+00> : vector<2x[3]xf32> +// CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T3]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T4:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T5:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T4]] : i64] : vector<2xf32> +// CHECK: %[[T6Insert:.*]] = llvm.insertelement %[[T5]] +// CHECK: %[[T6:.*]] = llvm.shufflevector %[[T6Insert]] +// CHECK: %[[T8:.*]] = llvm.extractvalue %[[T7]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T9:.*]] = llvm.intr.fmuladd(%[[T6]], %[[B]], %[[T8]]) : (vector<[3]xf32>, vector<[3]xf32>, vector<[3]xf32>) -> vector<[3]xf32> +// CHECK: %[[T11:.*]] = llvm.insertvalue %[[T9]], %[[T10]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T12:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[T13:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T12]] : i64] : vector<2xf32> +// CHECK: %[[T14Insert:.*]] = llvm.insertelement %[[T13]] +// CHECK: %[[T14:.*]] = llvm.shufflevector %[[T14Insert]] +// CHECK: %[[T16:.*]] = llvm.extractvalue %[[T7]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T17:.*]] = 
llvm.intr.fmuladd(%[[T14]], %[[B]], %[[T16]]) : (vector<[3]xf32>, vector<[3]xf32>, vector<[3]xf32>) -> vector<[3]xf32> +// CHECK: %[[T18:.*]] = llvm.insertvalue %[[T17]], %[[T11]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T19:.*]] = builtin.unrealized_conversion_cast %[[T18]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T19]] : vector<2x[3]xf32> + // ----- func.func @masked_float_add_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -427,6 +734,16 @@ func.func @masked_float_add_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_8:.*]] = llvm.intr.fmuladd(%[[VAL_0]], %{{.*}}, %[[VAL_2]]) : (vector<2xf32>, vector<2xf32>, vector<2xf32>) -> vector<2xf32> // CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_3]], %[[VAL_8]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_add_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_add_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = llvm.intr.fmuladd(%[[VAL_0]], %{{.*}}, %[[VAL_2]]) : (vector<[2]xf32>, vector<[2]xf32>, vector<[2]xf32>) -> vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_3]], %[[VAL_8]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_mul_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -440,6 +757,17 @@ func.func @masked_float_mul_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.mulf %[[VAL_8]], %[[VAL_2]] : 
vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_mul_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_mul_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.mulf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_max_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -453,6 +781,17 @@ func.func @masked_float_max_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.maxnumf %[[VAL_8]], %[[VAL_2]] : vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_max_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_max_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: 
%[[VAL_9:.*]] = arith.maxnumf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_min_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -466,6 +805,17 @@ func.func @masked_float_min_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.minnumf %[[VAL_8]], %[[VAL_2]] : vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_min_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_min_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.minnumf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_int_add_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -479,6 +829,17 @@ func.func @masked_int_add_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_add_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 
= vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_add_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_mul_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -492,6 +853,17 @@ func.func @masked_int_mul_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_mul_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_mul_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_max_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: 
vector<2xi1>) -> vector<2xi32> { @@ -505,6 +877,17 @@ func.func @masked_int_max_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.maxsi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_max_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_max_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.maxsi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_min_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -518,6 +901,17 @@ func.func @masked_int_min_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.minui %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_min_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_min_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, 
%[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.minui %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_and_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -531,6 +925,17 @@ func.func @masked_int_and_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.andi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_and_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_and_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.andi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_or_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -544,6 +949,17 @@ func.func @masked_int_or_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vect // CHECK: %[[VAL_9:.*]] = arith.ori %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : 
vector<2xi1>, vector<2xi32> +func.func @masked_int_or_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_or_outerprod_scalable +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.ori %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @shuffle_0D_direct(%arg0: vector) -> vector<3xf32> { diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 937dbf22bb713f8..85afdf7a7dc7718 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -6,6 +6,8 @@ ///---------------------------------------------------------------------------------------- /// vector.transfer_read /// [Pattern: FlattenContiguousRowMajorTransferReadPattern] +/// +/// NOTE: Scalable vectors are not supported ///---------------------------------------------------------------------------------------- func.func @transfer_read_dims_match_contiguous( @@ -28,6 +30,22 @@ func.func @transfer_read_dims_match_contiguous( // CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous // CHECK-128B: memref.collapse_shape +func.func @transfer_read_dims_match_contiguous_scalable( + %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<5x4x3x[2]xi8> { + + %c0 = arith.constant 0 : index + %cst = arith.constant 0 : i8 + %res = vector.transfer_read 
%mem[%c0, %c0, %c0, %c0], %cst : + memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<5x4x3x[2]xi8> + return %res : vector<5x4x3x[2]xi8> +} + +// CHECK-LABEL: func @transfer_read_dims_match_contiguous_scalable +// CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous_scalable +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_read_dims_match_contiguous_empty_stride( @@ -259,6 +277,8 @@ func.func @transfer_read_non_contiguous_src( ///---------------------------------------------------------------------------------------- /// vector.transfer_write /// [Pattern: FlattenContiguousRowMajorTransferWritePattern] +/// +/// NOTE: Scalable vectors are not supported ///---------------------------------------------------------------------------------------- func.func @transfer_write_dims_match_contiguous( @@ -281,6 +301,22 @@ func.func @transfer_write_dims_match_contiguous( // CHECK-128B-LABEL: func @transfer_write_dims_match_contiguous( // CHECK-128B: memref.collapse_shape +func.func @transfer_write_dims_match_contiguous_scalable( + %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, + %vec : vector<5x4x3x[2]xi8>) { + + %c0 = arith.constant 0 : index + vector.transfer_write %vec, %mem [%c0, %c0, %c0, %c0] : + vector<5x4x3x[2]xi8>, memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> + return +} + +// CHECK-LABEL: func @transfer_write_dims_match_contiguous_scalable( +// CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @transfer_write_dims_match_contiguous_scalable +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_write_dims_match_contiguous_empty_stride( @@ -504,7 +540,11 @@ func.func @transfer_write_non_contiguous_src( ///---------------------------------------------------------------------------------------- /// [Pattern: DropUnitDimFromElementwiseOps] +/// /// TODO: Move to a dedicated file - there's no "flattening" in the following tests +/// TODO: 
Potential duplication with tests from: +/// * "vector-dropleadunitdim-transforms.mlir" +/// * "vector-transfer-drop-unit-dims-patterns.mlir" ///---------------------------------------------------------------------------------------- func.func @fold_unit_dim_add_basic(%vec : vector<1x8xi32>) -> vector<1x8xi32> { @@ -737,6 +777,18 @@ func.func @transpose_with_scalable_unit_dims(%vec: vector<[1]x1x2x4x1xf32>) -> v // ----- +func.func @transpose_with_all_unit_dims(%vec: vector<1x1x1xf32>) -> vector<1x1x1xf32> { + %res = vector.transpose %vec, [0, 2, 1] : vector<1x1x1xf32> to vector<1x1x1xf32> + return %res : vector<1x1x1xf32> +} +// The `vec` is returned because there are other flattening patterns that fold +// vector.shape_cast ops away. +// CHECK-LABEL: func.func @transpose_with_all_unit_dims +// CHECK-SAME: %[[VEC:.[a-zA-Z0-9]+]] +// CHECK-NEXT: return %[[VEC]] + +// ----- + func.func @negative_transpose_with_no_unit_dims(%vec: vector<4x2x3xf32>) -> vector<4x3x2xf32> { %res = vector.transpose %vec, [0, 2, 1] : vector<4x2x3xf32> to vector<4x3x2xf32> return %res : vector<4x3x2xf32> diff --git a/mlir/test/Transforms/test-convert-func-op.mlir b/mlir/test/Transforms/test-convert-func-op.mlir index 6e96703cda57891..180f16a32991b32 100644 --- a/mlir/test/Transforms/test-convert-func-op.mlir +++ b/mlir/test/Transforms/test-convert-func-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-convert-func-op | FileCheck %s +// RUN: mlir-opt %s -test-convert-func-op --split-input-file | FileCheck %s // CHECK-LABEL: llvm.func @add func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface } { @@ -10,3 +10,31 @@ func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface // CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]+]]: i32 // CHECK-NEXT: [[RES:%.*]] = llvm.call @add([[ARG0]], [[ARG1]]) // CHECK-NEXT: llvm.return [[RES]] + +// ----- + +// Test that `llvm.byval` arguments are converted to `llvm.ptr` and the actual +// value is retrieved within the 
`llvm.func`. + +// CHECK-LABEL: llvm.func @byval +func.func @byval(%arg0: !test.smpla {llvm.byval = !test.smpla}) -> !test.smpla { + return %arg0 : !test.smpla +} + +// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr {llvm.byval = !llvm.struct<(i8, i8)>}) -> !llvm.struct<(i8, i8)> +// CHECK: %[[LD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> !llvm.struct<(i8, i8)> +// CHECK: llvm.return %[[LD]] : !llvm.struct<(i8, i8)> + +// ----- + +// Test that `llvm.byref` arguments are converted to `llvm.ptr` and the actual +// value is retrieved within the `llvm.func`. + +// CHECK-LABEL: llvm.func @byref +func.func @byref(%arg0: !test.smpla {llvm.byref = !test.smpla}) -> !test.smpla { + return %arg0 : !test.smpla +} + +// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr {llvm.byref = !llvm.struct<(i8, i8)>}) -> !llvm.struct<(i8, i8)> +// CHECK: %[[LD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> !llvm.struct<(i8, i8)> +// CHECK: llvm.return %[[LD]] : !llvm.struct<(i8, i8)> diff --git a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp index e25e890e2290a48..75168dde93130f6 100644 --- a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp +++ b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp @@ -47,12 +47,23 @@ struct ReturnOpConversion : public ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(func::ReturnOp returnOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(returnOp, - returnOp->getOperands()); + SmallVector resTys; + if (failed(typeConverter->convertTypes(returnOp->getResultTypes(), resTys))) + return failure(); + + rewriter.replaceOpWithNewOp(returnOp, resTys, + adaptor.getOperands()); return success(); } }; +static std::optional +convertSimpleATypeToStruct(test::SimpleAType simpleTy) { + MLIRContext *ctx = simpleTy.getContext(); + SmallVector memberTys(2, IntegerType::get(ctx, /*width=*/8)); + return LLVM::LLVMStructType::getLiteral(ctx, memberTys); +} + 
struct TestConvertFuncOp : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestConvertFuncOp) @@ -74,6 +85,7 @@ struct TestConvertFuncOp LowerToLLVMOptions options(ctx); // Populate type conversions. LLVMTypeConverter typeConverter(ctx, options); + typeConverter.addConversion(convertSimpleATypeToStruct); RewritePatternSet patterns(ctx); patterns.add(typeConverter); diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 1209c53d81bfb28..8b79de58fa1028b 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -102,9 +102,9 @@ add_mlir_tool(mlir-opt DEPENDS ${LIBS} SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) target_link_libraries(mlir-opt PRIVATE ${LIBS}) llvm_update_compile_flags(mlir-opt) mlir_check_all_link_libraries(mlir-opt) +export_executable_symbols_for_plugins(mlir-opt) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index e81e677dc58dd21..bedd363edd1bdee 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -664,8 +664,6 @@ libc_support_library( ":__support_cpp_optional", ":__support_cpp_string_view", ":__support_ctype_utils", - ":__support_fputil_dyadic_float", - ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", ":__support_fputil_rounding_mode", ":__support_str_to_integer",