From 1a92cc5a0ad108c515b1c383645ff70069c12077 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Aug 2024 06:45:42 -0500 Subject: [PATCH 001/112] [libc] Implement 'getenv' on the GPU target (#102376) Summary: This patch implements 'getenv'. I was torn on how to implement this, since realistically we only have access to this environment pointer in the "loader" interface. An alternative would be to use an RPC call every time, but I think that's overkill for what this will be used for. A better solution is just to emit a common `DataEnvironment` that contains all of the host visible resources to initialize. Right now this is the `env_ptr`, `clock_freq`, and `rpc_client`. I did this by making the `app.h` interface that Linux uses more general, could possibly move that into a separate patch, but I figured it's easier to see with the usage. --- libc/config/CMakeLists.txt | 10 +++++-- libc/config/app.h | 20 +++++++++++++ libc/config/gpu/app.h | 28 +++++++++++++++++++ libc/config/gpu/entrypoints.txt | 1 + libc/config/linux/CMakeLists.txt | 7 ----- .../__support/threads/linux/CMakeLists.txt | 2 +- libc/src/__support/threads/linux/thread.cpp | 2 +- libc/src/stdlib/CMakeLists.txt | 2 +- libc/src/stdlib/getenv.cpp | 2 +- libc/src/sys/auxv/linux/CMakeLists.txt | 2 +- libc/src/sys/auxv/linux/getauxval.cpp | 2 +- libc/startup/gpu/CMakeLists.txt | 2 +- libc/startup/gpu/amdgpu/CMakeLists.txt | 1 + libc/startup/gpu/amdgpu/start.cpp | 5 ++++ libc/startup/gpu/nvptx/CMakeLists.txt | 1 + libc/startup/gpu/nvptx/start.cpp | 6 ++++ libc/startup/linux/CMakeLists.txt | 2 +- libc/startup/linux/aarch64/CMakeLists.txt | 4 +-- libc/startup/linux/do_start.h | 2 +- libc/startup/linux/riscv/CMakeLists.txt | 4 +-- libc/startup/linux/x86_64/CMakeLists.txt | 4 +-- .../integration/src/stdlib/CMakeLists.txt | 1 - 22 files changed, 84 insertions(+), 26 deletions(-) create mode 100644 libc/config/app.h create mode 100644 libc/config/gpu/app.h delete mode 100644 libc/config/linux/CMakeLists.txt diff 
--git a/libc/config/CMakeLists.txt b/libc/config/CMakeLists.txt index 853854b03be4e4e..cf38ae3eed72677 100644 --- a/libc/config/CMakeLists.txt +++ b/libc/config/CMakeLists.txt @@ -1,3 +1,7 @@ -#TODO: Properly select the correct subdirectory. - -add_subdirectory(linux) +add_header_library( + app_h + HDRS + app.h + DEPENDS + libc.src.__support.common +) diff --git a/libc/config/app.h b/libc/config/app.h new file mode 100644 index 000000000000000..27f4141d80c4bee --- /dev/null +++ b/libc/config/app.h @@ -0,0 +1,20 @@ +//===-- Classes to capture properties of applications -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_CONFIG_APP_H +#define LLVM_LIBC_CONFIG_APP_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_GPU) +#include "gpu/app.h" +#elif defined(__linux__) +#include "linux/app.h" +#endif + +#endif // LLVM_LIBC_CONFIG_APP_H diff --git a/libc/config/gpu/app.h b/libc/config/gpu/app.h new file mode 100644 index 000000000000000..148c51b702203f7 --- /dev/null +++ b/libc/config/gpu/app.h @@ -0,0 +1,28 @@ +//===-- Classes to capture properties of GPU applications -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_CONFIG_GPU_APP_H +#define LLVM_LIBC_CONFIG_GPU_APP_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/architectures.h" + +#include + +namespace LIBC_NAMESPACE_DECL { + +// TODO: Move other global values here and export them to the host. +struct DataEnvironment { + uintptr_t *env_ptr; +}; + +extern DataEnvironment app; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_CONFIG_GPU_APP_H diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index e1a16a3b688789b..3e0e6aff5524565 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -167,6 +167,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.strtoull libc.src.stdlib.at_quick_exit libc.src.stdlib.quick_exit + libc.src.stdlib.getenv # TODO: Implement these correctly libc.src.stdlib.aligned_alloc diff --git a/libc/config/linux/CMakeLists.txt b/libc/config/linux/CMakeLists.txt deleted file mode 100644 index cf38ae3eed72677..000000000000000 --- a/libc/config/linux/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_header_library( - app_h - HDRS - app.h - DEPENDS - libc.src.__support.common -) diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt index c2f0ed0cb233db3..b6796f40adce7bd 100644 --- a/libc/src/__support/threads/linux/CMakeLists.txt +++ b/libc/src/__support/threads/linux/CMakeLists.txt @@ -77,7 +77,7 @@ add_object_library( thread.cpp DEPENDS .futex_utils - libc.config.linux.app_h + libc.config.app_h libc.include.sys_syscall libc.include.fcntl libc.src.errno.errno diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp index 36b4a88eba9b422..ee3f63fa3cde32e 100644 --- a/libc/src/__support/threads/linux/thread.cpp +++ 
b/libc/src/__support/threads/linux/thread.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/threads/thread.h" -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/stringstream.h" diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index 29789f5e2adc2b8..ce12e66cf3e57f8 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -62,7 +62,7 @@ add_entrypoint_object( HDRS getenv.h DEPENDS - libc.config.linux.app_h + libc.config.app_h ) add_entrypoint_object( diff --git a/libc/src/stdlib/getenv.cpp b/libc/src/stdlib/getenv.cpp index 6b1bb693a6d8311..e6ef03fad5c51ef 100644 --- a/libc/src/stdlib/getenv.cpp +++ b/libc/src/stdlib/getenv.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/getenv.h" -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/CPP/string_view.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" diff --git a/libc/src/sys/auxv/linux/CMakeLists.txt b/libc/src/sys/auxv/linux/CMakeLists.txt index 383c29eafda8d89..4884184cc605390 100644 --- a/libc/src/sys/auxv/linux/CMakeLists.txt +++ b/libc/src/sys/auxv/linux/CMakeLists.txt @@ -11,7 +11,7 @@ add_entrypoint_object( libc.src.__support.threads.callonce libc.src.__support.common libc.src.errno.errno - libc.config.linux.app_h + libc.config.app_h libc.src.fcntl.open libc.src.unistd.read libc.src.unistd.close diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp index bfa6b23b5ef913b..236fd25698f6597 100644 --- a/libc/src/sys/auxv/linux/getauxval.cpp +++ b/libc/src/sys/auxv/linux/getauxval.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/sys/auxv/getauxval.h" 
-#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/errno/libc_errno.h" diff --git a/libc/startup/gpu/CMakeLists.txt b/libc/startup/gpu/CMakeLists.txt index 5e5745063fc8c39..9d0e0885dff9390 100644 --- a/libc/startup/gpu/CMakeLists.txt +++ b/libc/startup/gpu/CMakeLists.txt @@ -34,7 +34,7 @@ function(add_startup_object name) RUNTIME_OUTPUT_DIRECTORY ${LIBC_LIBRARY_DIR} RUNTIME_OUTPUT_NAME ${name}.o) target_link_options(${fq_target_name}.exe PRIVATE - "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") + "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm") endif() endfunction() diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt index 3ac104ee8ba94a4..b67a5a2cc89fb24 100644 --- a/libc/startup/gpu/amdgpu/CMakeLists.txt +++ b/libc/startup/gpu/amdgpu/CMakeLists.txt @@ -3,6 +3,7 @@ add_startup_object( SRC start.cpp DEPENDS + libc.config.app_h libc.src.__support.RPC.rpc_client libc.src.__support.GPU.utils libc.src.stdlib.exit diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 6bda151023c8fce..5aaa7e938d27924 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "config/gpu/app.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/macros/config.h" @@ -16,6 +17,8 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +DataEnvironment app; + extern "C" uintptr_t __init_array_start[]; extern "C" uintptr_t __init_array_end[]; extern "C" uintptr_t __fini_array_start[]; @@ -40,6 +43,8 @@ static void call_fini_array_callbacks() { extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void _begin(int argc, char **argv, char **env) { + __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, + 
reinterpret_cast(env), __ATOMIC_RELAXED); // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt index 3ac104ee8ba94a4..b67a5a2cc89fb24 100644 --- a/libc/startup/gpu/nvptx/CMakeLists.txt +++ b/libc/startup/gpu/nvptx/CMakeLists.txt @@ -3,6 +3,7 @@ add_startup_object( SRC start.cpp DEPENDS + libc.config.app_h libc.src.__support.RPC.rpc_client libc.src.__support.GPU.utils libc.src.stdlib.exit diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index b1ef944c4aa2894..ef1e63e5161a61a 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "config/gpu/app.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/macros/config.h" @@ -16,6 +17,8 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +DataEnvironment app; + extern "C" { // Nvidia's 'nvlink' linker does not provide these symbols. We instead need // to manually create them and update the globals in the loader implementation. @@ -46,6 +49,9 @@ static void call_fini_array_callbacks() { extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void _begin(int argc, char **argv, char **env) { + __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, + reinterpret_cast(env), __ATOMIC_RELAXED); + // We want the fini array callbacks to be run after other atexit // callbacks are run. 
So, we register them before running the init // array callbacks as they can potentially register their own atexit diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt index 336c5d0f6bfa279..71f187ca05f29fe 100644 --- a/libc/startup/linux/CMakeLists.txt +++ b/libc/startup/linux/CMakeLists.txt @@ -95,7 +95,7 @@ add_object_library( HDRS do_start.h DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.include.llvm-libc-macros.link_macros diff --git a/libc/startup/linux/aarch64/CMakeLists.txt b/libc/startup/linux/aarch64/CMakeLists.txt index 5ea6ae59abcb284..5564f0a8f687e69 100644 --- a/libc/startup/linux/aarch64/CMakeLists.txt +++ b/libc/startup/linux/aarch64/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -18,7 +18,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h COMPILE_OPTIONS -fno-omit-frame-pointer -ffreestanding # To avoid compiler warnings about calling the main function. 
diff --git a/libc/startup/linux/do_start.h b/libc/startup/linux/do_start.h index dd41c9bd384e7b9..8fc8c3716f2ac0d 100644 --- a/libc/startup/linux/do_start.h +++ b/libc/startup/linux/do_start.h @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "config/linux/app.h" +#include "config/app.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/startup/linux/riscv/CMakeLists.txt b/libc/startup/linux/riscv/CMakeLists.txt index 3717784233c1513..2a61f8289067de2 100644 --- a/libc/startup/linux/riscv/CMakeLists.txt +++ b/libc/startup/linux/riscv/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -18,7 +18,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.src.__support.macros.attributes COMPILE_OPTIONS -fno-omit-frame-pointer diff --git a/libc/startup/linux/x86_64/CMakeLists.txt b/libc/startup/linux/x86_64/CMakeLists.txt index 30da7ab4e1ec3df..4f482eaf5d18eb6 100644 --- a/libc/startup/linux/x86_64/CMakeLists.txt +++ b/libc/startup/linux/x86_64/CMakeLists.txt @@ -3,7 +3,7 @@ add_startup_object( SRC tls.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.include.sys_mman libc.include.sys_syscall libc.src.__support.OSUtil.osutil @@ -20,7 +20,7 @@ add_startup_object( SRC start.cpp DEPENDS - libc.config.linux.app_h + libc.config.app_h libc.src.__support.macros.attributes COMPILE_OPTIONS -fno-stack-protector diff --git a/libc/test/integration/src/stdlib/CMakeLists.txt b/libc/test/integration/src/stdlib/CMakeLists.txt index 0985a80ce7a0907..1efdf607defe9b7 100644 --- a/libc/test/integration/src/stdlib/CMakeLists.txt +++ b/libc/test/integration/src/stdlib/CMakeLists.txt @@ -13,4 +13,3 @@ add_integration_test( FRANCE=Paris GERMANY=Berlin ) - From 
7e7a439705f4926a798785579aeb9bfff07049c7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 8 Aug 2024 08:05:33 -0400 Subject: [PATCH 002/112] [SLP][NFC]Introduce CombinedVectorize nodes, NFC. (#99309) This adds combined vectorized node. It simplifies handling of the combined nodes, like select/cmp, which can be reduced to min/max, mul/add transformed to fma, etc. Improves cost mode handling and may end up with better codegen in future (direct emission of the intrinsics). --- .../Transforms/Vectorize/SLPVectorizer.cpp | 139 +++++++++++------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7619e744f7a2f72..d14dd1dabd5fcd2 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2971,13 +2971,24 @@ class BoUpSLP { /// (either with vector instruction or with scatter/gather /// intrinsics for store/load)? enum EntryState { - Vectorize, - ScatterVectorize, - StridedVectorize, - NeedToGather + Vectorize, ///< The node is regularly vectorized. + ScatterVectorize, ///< Masked scatter/gather node. + StridedVectorize, ///< Strided loads (and stores) + NeedToGather, ///< Gather/buildvector node. + CombinedVectorize, ///< Vectorized node, combined with its user into more + ///< complex node like select/cmp to minmax, mul/add to + ///< fma, etc. Must be used for the following nodes in + ///< the pattern, not the very first one. }; EntryState State; + /// List of combined opcodes supported by the vectorizer. + enum CombinedOpcode { + NotCombinedOp = -1, + MinMax = Instruction::OtherOpsEnd + 1, + }; + CombinedOpcode CombinedOp = NotCombinedOp; + /// Does this sequence require some shuffling? 
SmallVector ReuseShuffleIndices; @@ -3165,6 +3176,9 @@ class BoUpSLP { case NeedToGather: dbgs() << "NeedToGather\n"; break; + case CombinedVectorize: + dbgs() << "CombinedVectorize\n"; + break; } dbgs() << "MainOp: "; if (MainOp) @@ -7213,6 +7227,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); break; + case TreeEntry::CombinedVectorize: case TreeEntry::NeedToGather: llvm_unreachable("Unexpected loads state."); } @@ -8294,6 +8309,22 @@ void BoUpSLP::transformNodes() { } break; } + case Instruction::Select: { + if (E.State != TreeEntry::Vectorize) + break; + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars); + if (MinMaxID == Intrinsic::not_intrinsic) + break; + // This node is a minmax node. + E.CombinedOp = TreeEntry::MinMax; + TreeEntry *CondEntry = const_cast(getOperandEntry(&E, 0)); + if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 && + CondEntry->State == TreeEntry::Vectorize) { + // The condition node is part of the combined minmax node. + CondEntry->State = TreeEntry::CombinedVectorize; + } + break; + } default: break; } @@ -9430,6 +9461,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); + if (E->CombinedOp != TreeEntry::NotCombinedOp) + ShuffleOrOp = E->CombinedOp; SetVector UniqueValues(VL.begin(), VL.end()); const unsigned Sz = UniqueValues.size(); SmallBitVector UsedScalars(Sz, false); @@ -9515,6 +9548,31 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, return VecCost - ScalarCost; }; + auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { + auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? 
VI : VL); + if (MinMaxID == Intrinsic::not_intrinsic) + return InstructionCost::getInvalid(); + Type *CanonicalType = Ty; + if (CanonicalType->isPtrOrPtrVectorTy()) + CanonicalType = CanonicalType->getWithNewType(IntegerType::get( + CanonicalType->getContext(), + DL->getTypeSizeInBits(CanonicalType->getScalarType()))); + + IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, + {CanonicalType, CanonicalType}); + InstructionCost IntrinsicCost = + TTI->getIntrinsicInstrCost(CostAttrs, CostKind); + // If the selects are the only uses of the compares, they will be + // dead and we can adjust the cost by removing their cost. + if (VI && SelectOnly) { + assert(!Ty->isVectorTy() && "Expected only for scalar type."); + auto *CI = cast(VI->getOperand(0)); + IntrinsicCost -= + TTI->getCmpSelInstrCost(CI->getOpcode(), Ty, Builder.getInt1Ty(), + CI->getPredicate(), CostKind, CI); + } + return IntrinsicCost; + }; switch (ShuffleOrOp) { case Instruction::PHI: { // Count reused scalars. @@ -9775,28 +9833,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost ScalarCost = TTI->getCmpSelInstrCost( E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred, CostKind, VI); - auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI); - if (MinMaxID != Intrinsic::not_intrinsic) { - Type *CanonicalType = OrigScalarTy; - if (CanonicalType->isPtrOrPtrVectorTy()) - CanonicalType = CanonicalType->getWithNewType(IntegerType::get( - CanonicalType->getContext(), - DL->getTypeSizeInBits(CanonicalType->getScalarType()))); - - IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, - {CanonicalType, CanonicalType}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be - // dead and we can adjust the cost by removing their cost. 
- if (SelectOnly) { - auto *CI = cast(VI->getOperand(0)); - IntrinsicCost -= TTI->getCmpSelInstrCost( - CI->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), - CI->getPredicate(), CostKind, CI); - } - ScalarCost = std::min(ScalarCost, IntrinsicCost); - } + InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI); + if (IntrinsicCost.isValid()) + ScalarCost = IntrinsicCost; return ScalarCost; }; @@ -9805,30 +9844,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VecCost = TTI->getCmpSelInstrCost( E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0); - // Check if it is possible and profitable to use min/max for selects - // in VL. - // - auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL); - if (MinMaxID != Intrinsic::not_intrinsic) { - Type *CanonicalType = VecTy; - if (CanonicalType->isPtrOrPtrVectorTy()) - CanonicalType = CanonicalType->getWithNewType(IntegerType::get( - CanonicalType->getContext(), - DL->getTypeSizeInBits(CanonicalType->getScalarType()))); - IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, - {CanonicalType, CanonicalType}); - InstructionCost IntrinsicCost = - TTI->getIntrinsicInstrCost(CostAttrs, CostKind); - // If the selects are the only uses of the compares, they will be - // dead and we can adjust the cost by removing their cost. 
- if (SelectOnly) { - auto *CI = - cast(cast(VL.front())->getOperand(0)); - IntrinsicCost -= TTI->getCmpSelInstrCost(CI->getOpcode(), VecTy, - MaskTy, VecPred, CostKind); - } - VecCost = std::min(VecCost, IntrinsicCost); - } if (auto *SI = dyn_cast(VL0)) { auto *CondType = getWidenedType(SI->getCondition()->getType(), VL.size()); @@ -9850,6 +9865,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, }; return GetCostDiff(GetScalarCost, GetVectorCost); } + case TreeEntry::MinMax: { + auto GetScalarCost = [&](unsigned Idx) { + return GetMinMaxCost(OrigScalarTy); + }; + auto GetVectorCost = [&](InstructionCost CommonCost) { + InstructionCost VecCost = GetMinMaxCost(VecTy); + return VecCost + CommonCost; + }; + return GetCostDiff(GetScalarCost, GetVectorCost); + } case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -10588,6 +10613,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallPtrSet CheckedExtracts; for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I]; + // No need to count the cost for combined entries, they are combined and + // just skip their cost. + if (TE.State == TreeEntry::CombinedVectorize) { + LLVM_DEBUG( + dbgs() << "SLP: Skipping cost for combined node that starts with " + << *TE.Scalars[0] << ".\n"; + TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + continue; + } if (TE.isGather()) { if (const TreeEntry *E = getTreeEntry(TE.getMainOp()); E && E->getVectorFactor() == TE.getVectorFactor() && @@ -12956,10 +12990,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { return ShuffleBuilder.finalize(E->ReuseShuffleIndices); }; - assert((E->State == TreeEntry::Vectorize || - E->State == TreeEntry::ScatterVectorize || - E->State == TreeEntry::StridedVectorize) && - "Unhandled state"); + assert(!E->isGather() && "Unhandled state"); unsigned ShuffleOrOp = E->isAltShuffle() ? 
(unsigned)Instruction::ShuffleVector : E->getOpcode(); Instruction *VL0 = E->getMainOp(); From 135fecd4441068667ef23f56098befd0c42f89f2 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Thu, 8 Aug 2024 08:35:22 -0400 Subject: [PATCH 003/112] [SystemZ][z/OS] __ptr32 support for z/OS (#101696) Enabling __ptr32 keyword to support in Clang for z/OS. It is represented by addrspace(1) in LLVM IR. Unlike existing implementation, __ptr32 is not mangled into symbol names for z/OS. --- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/TokenKinds.def | 3 +- clang/include/clang/Driver/Options.td | 4 + clang/lib/AST/ItaniumMangle.cpp | 8 +- clang/lib/Basic/IdentifierTable.cpp | 18 +- clang/lib/Basic/Targets/SystemZ.h | 40 ++- clang/lib/Driver/ToolChains/Clang.cpp | 6 + clang/lib/Frontend/CompilerInvocation.cpp | 8 + clang/lib/Sema/SemaType.cpp | 3 +- .../SystemZ/zos-mixed-ptr-sizes-definitions.c | 53 ++++ .../SystemZ/zos-mixed-ptr-sizes-malloc.c | 84 +++++ .../SystemZ/zos-mixed-ptr-sizes-sizeof.c | 94 ++++++ .../CodeGen/SystemZ/zos-mixed-ptr-sizes.c | 298 ++++++++++++++++++ clang/test/CodeGen/target-data.c | 2 +- .../zos-mangle-ptr-size-address-space.cpp | 17 + clang/test/Sema/ZOSExtensions.cpp | 120 +++++++ clang/test/Sema/attr-print-zos.c | 31 ++ .../Target/SystemZ/SystemZTargetMachine.cpp | 8 + 18 files changed, 786 insertions(+), 12 deletions(-) create mode 100644 clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c create mode 100644 clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c create mode 100644 clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c create mode 100644 clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c create mode 100644 clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp create mode 100644 clang/test/Sema/ZOSExtensions.cpp create mode 100644 clang/test/Sema/attr-print-zos.c diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 
6945f8b01e91cde..54e689a7a42213d 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -91,6 +91,7 @@ LANGOPT(C2y , 1, 0, "C2y") LANGOPT(MSVCCompat , 1, 0, "Microsoft Visual C++ full compatibility mode") LANGOPT(Kernel , 1, 0, "Kernel mode") LANGOPT(MicrosoftExt , 1, 0, "Microsoft C++ extensions") +LANGOPT(ZOSExt , 1, 0, "z/OS extensions") LANGOPT(AsmBlocks , 1, 0, "Microsoft inline asm blocks") LANGOPT(Borland , 1, 0, "Borland extensions") LANGOPT(CPlusPlus , 1, 0, "C++") diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index 2cea64e2bd590b7..421dbb413fed939 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -292,6 +292,7 @@ PUNCTUATOR(caretcaret, "^^") // CHAR8SUPPORT - This is a keyword if 'char8_t' is a built-in type // KEYFIXEDPOINT - This is a keyword according to the N1169 fixed point // extension. +// KEYZOS - This is a keyword in C/C++ on z/OS // KEYWORD(auto , KEYALL) KEYWORD(break , KEYALL) @@ -725,7 +726,7 @@ KEYWORD(__funcref , KEYALL) // Microsoft extensions which should be disabled in strict conformance mode KEYWORD(__ptr64 , KEYMS) -KEYWORD(__ptr32 , KEYMS) +KEYWORD(__ptr32 , KEYMS | KEYZOS) KEYWORD(__sptr , KEYMS) KEYWORD(__uptr , KEYMS) KEYWORD(__w64 , KEYMS) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 51ec29f1dc32120..e196c3dc5cb3be8 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3066,6 +3066,10 @@ dll version.}]>; def fms_omit_default_lib : Joined<["-"], "fms-omit-default-lib">, Group, Flags<[]>, Visibility<[ClangOption, CLOption]>; +def fzos_extensions : Flag<["-"], "fzos-extensions">, Group, Visibility<[ClangOption, CC1Option]>, + HelpText<"Accept some non-standard constructs supported by the z/OS compiler">; +def fno_zos_extensions : Flag<["-"], "fno-zos-extensions">, Group, 
Visibility<[ClangOption, CC1Option]>, + HelpText<"Do not accept non-standard constructs supported by the z/OS compiler">; defm delayed_template_parsing : BoolFOption<"delayed-template-parsing", LangOpts<"DelayedTemplateParsing">, DefaultFalse, PosFlag ::= U // ::= U + llvm::Triple Triple = getASTContext().getTargetInfo().getTriple(); + SmallString<64> ASString; LangAS AS = Quals.getAddressSpace(); @@ -2795,7 +2797,11 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp ASString = "ptr32_sptr"; break; case LangAS::ptr32_uptr: - ASString = "ptr32_uptr"; + // For z/OS, there are no special mangling rules applied to the ptr32 + // qualifier. Ex: void foo(int * __ptr32 p) -> _Z3f2Pi. The mangling for + // "p" is treated the same as a regular integer pointer. + if (!Triple.isOSzOS()) + ASString = "ptr32_uptr"; break; case LangAS::ptr64: ASString = "ptr64"; diff --git a/clang/lib/Basic/IdentifierTable.cpp b/clang/lib/Basic/IdentifierTable.cpp index 9cf081e9e26c139..c9c9d927a5902ef 100644 --- a/clang/lib/Basic/IdentifierTable.cpp +++ b/clang/lib/Basic/IdentifierTable.cpp @@ -107,12 +107,14 @@ enum TokenKey : unsigned { KEYMSCOMPAT = 0x400000, KEYSYCL = 0x800000, KEYCUDA = 0x1000000, - KEYHLSL = 0x2000000, - KEYFIXEDPOINT = 0x4000000, + KEYZOS = 0x2000000, + KEYNOZOS = 0x4000000, + KEYHLSL = 0x8000000, + KEYFIXEDPOINT = 0x10000000, KEYMAX = KEYFIXEDPOINT, // The maximum key KEYALLCXX = KEYCXX | KEYCXX11 | KEYCXX20, - KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & - ~KEYNOOPENCL // KEYNOMS18 and KEYNOOPENCL are used to exclude. + KEYALL = (KEYMAX | (KEYMAX - 1)) & ~KEYNOMS18 & ~KEYNOOPENCL & + ~KEYNOZOS // KEYNOMS18, KEYNOOPENCL, KEYNOZOS are excluded. }; /// How a keyword is treated in the selected standard. This enum is ordered @@ -199,6 +201,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts, return LangOpts.isSYCL() ? KS_Enabled : KS_Unknown; case KEYCUDA: return LangOpts.CUDA ? 
KS_Enabled : KS_Unknown; + case KEYZOS: + return LangOpts.ZOSExt ? KS_Enabled : KS_Unknown; case KEYHLSL: return LangOpts.HLSL ? KS_Enabled : KS_Unknown; case KEYNOCXX: @@ -206,9 +210,8 @@ static KeywordStatus getKeywordStatusHelper(const LangOptions &LangOpts, // reasons as well. return LangOpts.CPlusPlus ? KS_Unknown : KS_Enabled; case KEYNOOPENCL: - // The disable behavior for this is handled in getKeywordStatus. - return KS_Unknown; case KEYNOMS18: + case KEYNOZOS: // The disable behavior for this is handled in getKeywordStatus. return KS_Unknown; case KEYFIXEDPOINT: @@ -230,7 +233,8 @@ static KeywordStatus getKeywordStatus(const LangOptions &LangOpts, if (LangOpts.MSVCCompat && (Flags & KEYNOMS18) && !LangOpts.isCompatibleWithMSVC(LangOptions::MSVC2015)) return KS_Disabled; - + if (LangOpts.ZOSExt && (Flags & KEYNOZOS)) + return KS_Disabled; KeywordStatus CurStatus = KS_Unknown; while (Flags != 0) { diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 3bc6f2c1d308328..7390f25d6efb1de 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -21,6 +21,30 @@ namespace clang { namespace targets { +static const unsigned ZOSAddressMap[] = { + 0, // Default + 0, // opencl_global + 0, // opencl_local + 0, // opencl_constant + 0, // opencl_private + 0, // opencl_generic + 0, // opencl_global_device + 0, // opencl_global_host + 0, // cuda_device + 0, // cuda_constant + 0, // cuda_shared + 0, // sycl_global + 0, // sycl_global_device + 0, // sycl_global_host + 0, // sycl_local + 0, // sycl_private + 0, // ptr32_sptr + 1, // ptr32_uptr + 0, // ptr64 + 0, // hlsl_groupshared + 0 // wasm_funcref +}; + class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { static const char *const GCCRegNames[]; @@ -30,6 +54,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { bool HasVector; bool SoftFloat; bool UnalignedSymbols; + enum AddrSpace { ptr32 = 1 }; public: 
SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &) @@ -49,6 +74,9 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { MinGlobalAlign = 16; HasUnalignedAccess = true; if (Triple.isOSzOS()) { + if (Triple.isArch64Bit()) { + AddrSpaceMap = &ZOSAddressMap; + } TLSSupported = false; // All vector types are default aligned on an 8-byte boundary, even if the // vector facility is not available. That is different from Linux. @@ -56,7 +84,7 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { // Compared to Linux/ELF, the data layout differs only in some details: // - name mangling is GOFF. // - 32 bit pointers, either as default or special address space - resetDataLayout("E-m:l-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-" + resetDataLayout("E-m:l-p1:32:32-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-" "a:8:16-n32:64"); } else { TLSSupported = true; @@ -224,6 +252,16 @@ class LLVM_LIBRARY_VISIBILITY SystemZTargetInfo : public TargetInfo { std::pair hardwareInterferenceSizes() const override { return std::make_pair(256, 256); } + uint64_t getPointerWidthV(LangAS AddrSpace) const override { + return (getTriple().isOSzOS() && getTriple().isArch64Bit() && + getTargetAddressSpace(AddrSpace) == ptr32) + ? 
32 + : PointerWidth; + } + + uint64_t getPointerAlignV(LangAS AddrSpace) const override { + return getPointerWidthV(AddrSpace); + } }; } // namespace targets } // namespace clang diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2054c8fe928e2e4..c698d38b80e578f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7514,6 +7514,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, (C.isForDiagnostics() && !HaveModules)) CmdArgs.push_back("-frewrite-includes"); + if (Args.hasFlag(options::OPT_fzos_extensions, + options::OPT_fno_zos_extensions, false)) + CmdArgs.push_back("-fzos-extensions"); + else if (Args.hasArg(options::OPT_fno_zos_extensions)) + CmdArgs.push_back("-fno-zos-extensions"); + // Only allow -traditional or -traditional-cpp outside in preprocessing modes. if (Arg *A = Args.getLastArg(options::OPT_traditional, options::OPT_traditional_cpp)) { diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 225bd6416ce5fc3..4dbb13fb723eaf6 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3650,6 +3650,11 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_ftrigraphs); } + if (T.isOSzOS() && !Opts.ZOSExt) + GenerateArg(Consumer, OPT_fno_zos_extensions); + else if (Opts.ZOSExt) + GenerateArg(Consumer, OPT_fzos_extensions); + if (Opts.Blocks && !(Opts.OpenCL && Opts.OpenCLVersion == 200)) GenerateArg(Consumer, OPT_fblocks); @@ -4051,6 +4056,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Opts.Trigraphs = Args.hasFlag(OPT_ftrigraphs, OPT_fno_trigraphs, Opts.Trigraphs); + Opts.ZOSExt = + Args.hasFlag(OPT_fzos_extensions, OPT_fno_zos_extensions, T.isOSzOS()); + Opts.Blocks = Args.hasArg(OPT_fblocks) || (Opts.OpenCL && Opts.OpenCLVersion == 200); diff --git 
a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 8f8e9183330ec96..91c8f18648fa473 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7057,6 +7057,7 @@ static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State, // Add address space to type based on its attributes. LangAS ASIdx = LangAS::Default; + llvm::Triple Triple = S.Context.getTargetInfo().getTriple(); uint64_t PtrWidth = S.Context.getTargetInfo().getPointerWidth(LangAS::Default); if (PtrWidth == 32) { @@ -7065,7 +7066,7 @@ static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State, else if (Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; } else if (PtrWidth == 64 && Attrs[attr::Ptr32]) { - if (Attrs[attr::UPtr]) + if (Triple.isOSzOS() || Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; else ASIdx = LangAS::ptr32_sptr; diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c new file mode 100644 index 000000000000000..75fc396e3e0afe9 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-definitions.c @@ -0,0 +1,53 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-ZOS +// RUN: %clang_cc1 -triple s390x-ibm-linux -fzos-extensions -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-LINUX +// RUN: %clang_cc1 -triple s390x-linux-gnu -fzos-extensions -emit-llvm < %s | FileCheck %s --check-prefix=PTR32-LINUX + +void ptr32_declarations() { + // PTR32-ZOS-LABEL: @ptr32_declarations() + // PTR32-LINUX-LABEL: @ptr32_declarations() + + // PTR32-ZOS: %p1 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p1 = alloca i8 addrspace(1)*, align 4 + // PTR32-LINUX: %p1 = alloca ptr, align 8 + char * __ptr32 p1; + + // PTR32-ZOS: %p2 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p2 = alloca ptr addrspace(1), align 8 + // PTR32-LINUX: %p2 = alloca ptr, align 8 + char * __ptr32 *p2; + + // PTR32-ZOS: %p3 = alloca ptr 
addrspace(1), align 4 + // PTR32-LINUX-NOT: %p3 = alloca i8* addrspace(1)*, align 4 + // PTR32-LINUX: %p3 = alloca ptr, align 8 + char ** __ptr32 p3; + + // PTR32-ZOS: %p4 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p4 = alloca ptr addrspace(1), align 8 + // PTR32-LINUX: %p4 = alloca ptr, align 8 + char ** __ptr32 *p4; + + // PTR32-ZOS: %p5 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p5 = alloca ptr addrspace(1), align 8 + // PTR32-LINUX: %p5 = alloca ptr, align 8 + char *** __ptr32 *p5; + + // PTR32-ZOS: %p6 = alloca ptr, align 8 + // PTR32-LINUX: %p6 = alloca ptr, align 8 + char **p6; + + // PTR32-ZOS: %p7 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p7 = alloca i8 addrspace(1)* addrspace(1)*, align 4 + // PTR32-LINUX: %p7 = alloca ptr, align 8 + char * __ptr32 * __ptr32 p7; + + // PTR32-ZOS: %p8 = alloca ptr addrspace(1), align 4 + // PTR32-LINUX-NOT: %p8 = alloca i8* addrspace(1)* addrspace(1)*, align 4 + // PTR32-LINUX: %p8 = alloca ptr, align 8 + char ** __ptr32 * __ptr32 p8; + + // PTR32-ZOS: %p9 = alloca ptr, align 8 + // PTR32-LINUX-NOT: %p9 = alloca i8* addrspace(1)* addrspace(1)**, align 8 + // PTR32-LINUX: %p9 = alloca ptr, align 8 + char ** __ptr32 * __ptr32 *p9; + +} diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c new file mode 100644 index 000000000000000..b84bca6b42f483e --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c @@ -0,0 +1,84 @@ +// RUN: %clang -target s390x-ibm-zos -emit-llvm -S -O2 %s -o - | FileCheck %s --check-prefix=X64 +#include +void *__malloc31(size_t); + +int test_1() { + // X64-LABEL: define {{.*}} i32 @test_1() + // X64: ret i32 135 + int *__ptr32 a; + int *b; + int i; + int sum1, sum2, sum3; + + a = (int *__ptr32)__malloc31(sizeof(int) * 10); + + b = a; + sum1 = 0; + for (i = 0; i < 10; ++i) { + a[i] = i; + sum1 += i; + } + + sum2 = 0; + for (i = 0; i < 10; ++i) { + sum2 += a[i]; + } + sum3 = 0; + for (i = 0; i 
< 10; ++i) { + sum3 += b[i]; + } + + return (sum1 + sum2 + sum3); +} + +int test_2() { + // X64-LABEL: define {{.*}} i32 @test_2() + // X64: ret i32 4 + int *a = (int *)__malloc31(sizeof(int)); + int *__ptr32 b; + + *a = 99; + b = a; + *b = 44; + + // Test should return 4 + return (*b - 40); +} + +int test_3() { + // X64-LABEL: define {{.*}} i32 @test_3() + // X64: ret i32 4 + int *a = (int *)__malloc31(sizeof(int)); + int *__ptr32 b; + + *a = 99; + b = a; + + // Test should return 4 + return (*b - 95); +} + +int test_4() { + // X64-LABEL: define {{.*}} i32 @test_4() + // X64: ret i32 1 + int *a = (int *)__malloc31(sizeof(int)); + float *d = (float *)__malloc31(sizeof(float)); + + int *__ptr32 b; + int *c; + + float *__ptr32 e; + float *f; + + *a = 0; + *d = 0.0; + + b = a; + c = a; + e = d; + f = d; + + // Test should return 1 + return (b == c && e == f); +} + diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c new file mode 100644 index 000000000000000..6b434a926f706b0 --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-sizeof.c @@ -0,0 +1,94 @@ +// RUN: %clang_cc1 -emit-llvm -triple s390x-ibm-zos -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-ZOS +// RUN: %clang_cc1 -emit-llvm -triple s390x-ibm-linux -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-LINUX +// RUN: %clang_cc1 -emit-llvm -triple s390x-linux-gnu -fzos-extensions -fdump-record-layouts < %s | FileCheck %s --check-prefix=PTR32-LINUX + +// PTR32-ZOS: 0 | struct s1 +// PTR32-ZOS-NEXT: 0 | long a +// PTR32-ZOS-NEXT: 8 | int b +// PTR32-ZOS-NEXT: 12 | int * __ptr32 c +// PTR32-ZOS-NEXT: 16 | int d +// PTR32-ZOS-NEXT: | [sizeof=24, align=8] + +// PTR32-LINUX: 0 | struct s1 +// PTR32-LINUX-NEXT: 0 | long a +// PTR32-LINUX-NEXT: 8 | int b +// PTR32-LINUX-NEXT: 16 | int * __ptr32 c +// PTR32-LINUX-NEXT: 24 | int d +// PTR32-LINUX-NEXT: | [sizeof=32, align=8] 
+struct s1 { + long a; + int b; + int * __ptr32 c; + int d; +} S1; + +// PTR32-ZOS: 0 | struct s2 +// PTR32-ZOS-NEXT: 0 | long a +// PTR32-ZOS-NEXT: 8 | int b +// PTR32-ZOS-NEXT: 16 | int * c +// PTR32-ZOS-NEXT: 24 | int d +// PTR32-ZOS-NEXT: | [sizeof=32, align=8] + +// PTR32-LINUX: 0 | struct s2 +// PTR32-LINUX-NEXT: 0 | long a +// PTR32-LINUX-NEXT: 8 | int b +// PTR32-LINUX-NEXT: 16 | int * c +// PTR32-LINUX-NEXT: 24 | int d +// PTR32-LINUX-NEXT: | [sizeof=32, align=8] +struct s2 { + long a; + int b; + int *c; + int d; +} S2; + +// PTR32-ZOS: 0 | struct s3 +// PTR32-ZOS-NEXT: 0 | int a +// PTR32-ZOS-NEXT: 4 | int * __ptr32 b +// PTR32-ZOS-NEXT: 8 | int * __ptr32 c +// PTR32-ZOS-NEXT: 12 | int * d +// PTR32-ZOS-NEXT: | [sizeof=20, align=1] + +struct __attribute__((packed)) s3 { + int a; + int *__ptr32 b; + int *__ptr32 c; + int *d; +}; +struct s3 S3; + +// PTR32-ZOS: 0 | union u1 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | int * b +// PTR32-ZOS-NEXT: | [sizeof=8, align=8] + +// PTR32-LINUX: 0 | union u1 +// PTR32-LINUX-NEXT: 0 | int * __ptr32 a +// PTR32-LINUX-NEXT: 0 | int * b +// PTR32-LINUX-NEXT: | [sizeof=8, align=8] +union u1 { + int *__ptr32 a; + int *b; +} U1; + +// PTR32-ZOS: 0 | union u2 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | int * b +// PTR32-ZOS-NEXT: | [sizeof=8, align=1] + +union __attribute__((packed)) u2 { + int *__ptr32 a; + int *b; +}; +union u2 U2; + +// PTR32-ZOS: 0 | union u3 +// PTR32-ZOS-NEXT: 0 | int * __ptr32 a +// PTR32-ZOS-NEXT: 0 | short b +// PTR32-ZOS-NEXT: | [sizeof=4, align=1] + +union __attribute__((packed)) u3 { + int *__ptr32 a; + short b; +}; +union u3 U3; diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c new file mode 100644 index 000000000000000..6194c9b1804fb0b --- /dev/null +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes.c @@ -0,0 +1,298 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos -emit-llvm -O2 < %s | FileCheck 
%s --check-prefix=X64 + +#define PSA_PTR 0x00 +#define PSAAOLD 0x224 + +struct Foo { + int * __ptr32 p32; + int *p64; + char *cp64; +}; + +void use_foo(struct Foo *f); + +void ptr32_to_ptr(struct Foo *f, int * __ptr32 i) { + // X64-LABEL: define void @ptr32_to_ptr(ptr noundef %f, ptr addrspace(1) noundef %i) + // X64: %{{.+}} = addrspacecast ptr addrspace(1) %i to ptr + f->p64= i; + use_foo(f); +} + +void ptr_to_ptr32(struct Foo *f, int *i) { + // X64-LABEL: define void @ptr_to_ptr32(ptr noundef %f, ptr noundef %i) + // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(1) + f->p32 = i; + use_foo(f); +} + +void ptr32_to_ptr32(struct Foo *f, int * __ptr32 i) { + // X64-LABEL: define void @ptr32_to_ptr32(ptr noundef %f, ptr addrspace(1) noundef %i) + // X64-NOT: addrspacecast + f->p32 = i; + use_foo(f); +} + +void ptr_to_ptr32_explicit_cast(struct Foo *f, int *i) { + // X64-LABEL: define void @ptr_to_ptr32_explicit_cast(ptr noundef %f, ptr noundef %i) + // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(1) + f->p32 = (int * __ptr32)i; + use_foo(f); +} + +void test_indexing(struct Foo *f) { + // X64-LABEL: define void @test_indexing(ptr noundef %f) + // X64: addrspacecast ptr addrspace(1) {{%[0-9]}} to ptr + f->cp64 = ((char * __ptr32 *)1028)[1]; + use_foo(f); +} + +void test_indexing_2(struct Foo *f) { + // X64-LABEL: define void @test_indexing_2(ptr noundef %f) + // X64: getelementptr inbounds i8, ptr addrspace(1) {{%[0-9]}}, i32 16 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 24 + f->cp64 = ((char *** __ptr32 *)1028)[1][2][3]; + use_foo(f); +} + +unsigned long* test_misc() { + // X64-LABEL: define ptr @test_misc() + // X64: %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %0, i32 88 + // X64-NEXT: %1 = load ptr, ptr addrspace(1) %arrayidx + // X64-NEXT: %arrayidx1 = getelementptr inbounds i8, ptr %1, i64 8 + // X64-NEXT: %2 = load ptr, ptr %arrayidx1 + // X64-NEXT: %arrayidx2 = getelementptr inbounds i8, ptr %2, i64 904 + // X64-NEXT: %3 = 
load ptr, ptr %arrayidx2 + // X64-NEXT: %arrayidx3 = getelementptr inbounds i8, ptr %3, i64 1192 + unsigned long* x = (unsigned long*)((char***** __ptr32*)1208)[0][11][1][113][149]; + return x; +} + +char* __ptr32* __ptr32 test_misc_2() { + // X64-LABEL: define ptr addrspace(1) @test_misc_2() + // X64: br i1 %cmp, label %if.then, label %if.end + // X64: %1 = load ptr addrspace(1), ptr inttoptr (i64 16 to ptr) + // X64-NEXT: %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %1, i32 544 + // X64-NEXT: %2 = load ptr addrspace(1), ptr addrspace(1) %arrayidx + // X64-NEXT: %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %2, i32 24 + // X64-NEXT: %3 = load ptr addrspace(1), ptr addrspace(1) %arrayidx1 + // X64-NEXT: store ptr addrspace(1) %3, ptr @test_misc_2.res + // X64: ret ptr addrspace(1) + static char* __ptr32* __ptr32 res = 0; + if (res == 0) { + res = ((char* __ptr32* __ptr32* __ptr32* __ptr32*)0)[4][136][6]; + } + return res; +} + +unsigned short test_misc_3() { + // X64-LABEL: define zeroext i16 @test_misc_3() + // X64: %0 = load ptr addrspace(1), ptr inttoptr (i64 548 to ptr) + // X64-NEXT: %1 = addrspacecast ptr addrspace(1) %0 to ptr + // X64-NEXT: %arrayidx = getelementptr inbounds i8, ptr %1, i64 36 + // X64-NEXT: %2 = load i16, ptr %arrayidx, align 2 + // X64-NEXT: ret i16 %2 + unsigned short this_asid = ((unsigned short*)(*(char* __ptr32*)(0x224)))[18]; + return this_asid; +} + +int test_misc_4() { + // X64-LABEL: define signext range(i32 0, 2) i32 @test_misc_4() + // X64: getelementptr inbounds i8, ptr addrspace(1) {{%[0-9]}}, i32 88 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 8 + // X64: getelementptr inbounds i8, ptr {{%[0-9]}}, i64 984 + // X64: getelementptr inbounds i8, ptr %3, i64 80 + // X64: icmp sgt i32 {{.*[0-9]}}, 67240703 + // X64: ret i32 + int a = (*(int*)(80 + ((char**** __ptr32*)1208)[0][11][1][123]) > 0x040202FF); + return a; +} + +void test_misc_5(struct Foo *f) { + // X64-LABEL: define void @test_misc_5(ptr 
noundef %f) + // X64: addrspacecast ptr addrspace(1) %0 to ptr + f->cp64 = *(char* __ptr32 *)(PSA_PTR + PSAAOLD); + use_foo(f); +} + +int test_misc_6() { + // X64-LABEL: define {{.*}} i32 @test_misc_6() + // X64: ret i32 8 + int * __ptr32 ip32; + int *ip64; + ip64 = ip32; + return sizeof(ip64); +} + +int test_misc_7() { + // X64-LABEL: define {{.*}} i32 @test_misc_7() + // X64: ret i32 12 + int foo = 12; + + int *ip64; + int * __ptr32 ip32; + + ip64 = &foo; + ip32 = (int * __ptr32) ip64; + + return *ip32; +} + +int test_misc_8() { + // X64-LABEL: define {{.*}} i32 @test_misc_8() + // X64: ret i32 97 + char foo = 'a'; + + char *cp64; + char * __ptr32 cp32; + + cp64 = &foo; + cp32 = (char * __ptr32) cp64; + + return *cp32; +} + +int test_misc_9() { + // X64-LABEL: define {{.*}} i32 @test_misc_9() + // X64: ret i32 15 + int foo = 15; + + int *ip64; + int * __ptr32 ip32; + + ip32 = &foo; + ip64 = (int *)ip32; + + return *ip64; +} + +int test_misc_10() { + // X64-LABEL: define {{.*}} i32 @test_misc_10() + // X64: ret i32 97 + char foo = 'a'; + + char *cp64; + char * __ptr32 cp32; + + cp32 = &foo; + cp64= (char *)cp32; + + return *cp64; +} + +int test_function_ptr32_is_32bit() { + // X64-LABEL: define {{.*}} i32 @test_function_ptr32_is_32bit() + // X64: ret i32 4 + int (* __ptr32 a)(int a); + return sizeof(a); +} + +int get_processor_count() { + // X64-LABEL: define signext range(i32 -128, 128) i32 @get_processor_count() + // X64: load ptr addrspace(1), ptr inttoptr (i64 16 to ptr) + // X64-NEXT: [[ARR_IDX1:%[a-z].*]] = getelementptr inbounds i8, ptr addrspace(1) %0, i32 660 + // X64: load ptr addrspace(1), ptr addrspace(1) [[ARR_IDX1]] + // X64: load i8, ptr addrspace(1) {{%[a-z].*}} + // X64: sext i8 {{%[0-9]}} to i32 + // X64-NEXT: ret i32 + return ((char * __ptr32 * __ptr32 *)0)[4][165][53]; +} + +int get_sizes_ptr32() { + // X64-LABEL: define {{.*}} i32 @get_sizes_ptr32() + // X64: ret i32 72 + char * __ptr32 a; + signed char * __ptr32 b; + unsigned char *__ptr32 c; 
+ int * __ptr32 d; + signed int * __ptr32 e; + unsigned int *__ptr32 f; + short * __ptr32 g; + signed short * __ptr32 h; + unsigned short * __ptr32 i; + long * __ptr32 j; + signed * __ptr32 k; + unsigned * __ptr32 l; + long long * __ptr32 m; + signed long long * __ptr32 n; + unsigned long long * __ptr32 o; + float * __ptr32 p; + double * __ptr32 q; + long double * __ptr32 r; + + int sum = 0; + sum += sizeof(a); + sum += sizeof(b); + sum += sizeof(c); + sum += sizeof(d); + sum += sizeof(e); + sum += sizeof(f); + sum += sizeof(g); + sum += sizeof(h); + sum += sizeof(i); + sum += sizeof(j); + sum += sizeof(k); + sum += sizeof(l); + sum += sizeof(m); + sum += sizeof(n); + sum += sizeof(o); + sum += sizeof(p); + sum += sizeof(q); + sum += sizeof(r); + + return sum; +} + +int get_sizes_p64() { + // X64-LABEL: define {{.*}} i32 @get_sizes_p64() + // X64: ret i32 144 + char *a; + signed char *b; + unsigned char *c; + int *d; + signed int *e; + unsigned int *f; + short *g; + signed short *h; + unsigned short *i; + long *j; + signed *k; + unsigned *l; + long long *m; + signed long long *n; + unsigned long long *o; + float *p; + double *q; + long double *r; + + int sum = 0; + sum += sizeof(a); + sum += sizeof(b); + sum += sizeof(c); + sum += sizeof(d); + sum += sizeof(e); + sum += sizeof(f); + sum += sizeof(g); + sum += sizeof(h); + sum += sizeof(i); + sum += sizeof(j); + sum += sizeof(k); + sum += sizeof(l); + sum += sizeof(m); + sum += sizeof(n); + sum += sizeof(o); + sum += sizeof(p); + sum += sizeof(q); + sum += sizeof(r); + + return sum; + +} + +int host_cpu() { + char *__ptr32 CVT = *(char * __ptr32 *__ptr32) 16; + unsigned short Id = *(unsigned short *)&CVT[-6]; + Id = ((((Id >> 12) & 0x0f) * 10 + ((Id >> 8) & 0x0f)) * 10 + ((Id >> 4) & 0x0f)) * 10 + (Id & 0x0f); + int HaveVectorSupport = CVT[244] & 0x80; + int z13 = (Id >= 2964 && HaveVectorSupport); + return z13; +} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index 
9a9fda70226fc2a..41cbd5a0219d5ec 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -235,7 +235,7 @@ // RUN: FileCheck %s -check-prefix=ZOS // RUN: %clang_cc1 -triple s390x-none-zos -target-cpu z13 -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=ZOS -// ZOS: target datalayout = "E-m:l-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" +// ZOS: target datalayout = "E-m:l-p1:32:32-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" // RUN: %clang_cc1 -triple msp430-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=MSP430 diff --git a/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp b/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp new file mode 100644 index 000000000000000..d14ce117b2be4ae --- /dev/null +++ b/clang/test/CodeGenCXX/zos-mangle-ptr-size-address-space.cpp @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -fzos-extensions -emit-llvm -triple s390x-ibm-zos -x c++ -o - %s | FileCheck %s --check-prefixes=CHECK + +// CHECK-LABEL: define void @_Z2f1v() +void f1() {} + +// CHECK-LABEL: define void @_Z2f2Pi(ptr addrspace(1) noundef %p32) +void f2(int * __ptr32 p32) {} + +// CHECK-LABEL: define noundef ptr addrspace(1) @_Z2f3Pi(ptr addrspace(1) noundef %p32) +int * __ptr32 f3(int * __ptr32 p32) { + return p32; +} + +// CHECK-LABEL: define noundef ptr @_Z2f4PPi(ptr noundef %p32) +int * __ptr32 *f4(int * __ptr32 *p32) { + return p32; +} diff --git a/clang/test/Sema/ZOSExtensions.cpp b/clang/test/Sema/ZOSExtensions.cpp new file mode 100644 index 000000000000000..9b2d3cdb34529cb --- /dev/null +++ b/clang/test/Sema/ZOSExtensions.cpp @@ -0,0 +1,120 @@ +// RUN: %clang_cc1 -triple s390x-ibm-zos %s -fsyntax-only -fzos-extensions -verify +// RUN: %clang_cc1 -triple s390x-ibm-zos %s -fsyntax-only -verify + +struct A { + int a; + short b; + float q; + double z; +}; + +union B { + int a; + short b; + float q; + double z; +}; + +class C { + int a; + short b; + float q; + double z; +}; + +// 
************************ +// INCORRECT DECLARATION +// ************************ +int * __ptr64 p64; // expected-error {{expected ';' after top level declarator}} +int *wrong_var3 __ptr32; // expected-error {{expected ';' after top level declarator}} expected-warning {{declaration does not declare anything}} + +// ************************** +// INCORRECT USAGES OF PTR32 +// ************************** +struct D { + int __ptr32 *a; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +}; + +union E { + int __ptr32 *b; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +}; + +char __ptr32 *a; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed char __ptr32 *b; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned char __ptr32 *c; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *d; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed int __ptr32 *e; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned int __ptr32 *f; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +short int __ptr32 *g; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed short int __ptr32 *h; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned short int __ptr32 *i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +long int __ptr32 *j; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed long int __ptr32 *k; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +unsigned long int __ptr32 *l; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +long long int __ptr32 *m; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +signed long long int __ptr32 *n; // expected-error 
{{'__ptr32' attribute only applies to pointer arguments}} +unsigned long long int __ptr32 *o; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +float __ptr32 *p; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +double __ptr32 *q; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 **r; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *__ptr32 *s; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *__ptr32 *__ptr32 t; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 *__ptr32 *__ptr32 *__ptr32 u; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 __ptr32 **v_i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int __ptr32 __ptr32 __ptr32 w_i; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +__ptr32 int wrong_var; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +struct A __ptr32 *c1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 **e1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 *__ptr32 *f1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +struct A __ptr32 *__ptr32 *__ptr32 g1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 *d1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 **h1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 * __ptr32 *i1; // expected-error 
{{'__ptr32' attribute only applies to pointer arguments}} +union B __ptr32 * __ptr32 * __ptr32 j1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +C __ptr32 **k1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +C __ptr32 * __ptr32 *l1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +C __ptr32 * __ptr32 * __ptr32 m1; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} + +struct D n1; +union E o1; + +int incorrect_func() { + int __ptr32 = 1; // expected-error {{expected unqualified-id}} + return __ptr32; // expected-error {{expected expression}} +} + +typedef int __ptr32; // expected-warning {{typedef requires a name}} +int incorrect_func2() { + return 1; +} + +typedef int __ptr32 *v; // expected-error {{'__ptr32' attribute only applies to pointer arguments}} +int incorrect_func3() { + v v1; + return 0; +} + +int *__ptr32 a_ptr; //expected-note {{previous definition is here}} +int *a_ptr; // expected-error {{redefinition of 'a_ptr' with a different type: 'int *' vs 'int * __ptr32'}} + +// ******************************************************* +// FUNCTION OVERLOADING BETWEEN PTR32 AND REGULAR POINTERS +// ******************************************************* +void func(int * __ptr32 p32) {} // expected-note {{previous definition is here}} +void func(int *p64) {} // expected-error {{redefinition of 'func'}} + +// Overloads between ptr32 and other non-pointer types are permissible +void func1(int *__ptr32 p32) {} +void func1(int p64) {} + +// ****** +// MISC +// ****** +void func2() { + char * __ptr32 v = ((char * __ptr32 *)1028)[0]; + char *v1 = ((char ** __ptr32 *)1028)[0][1]; +} + diff --git a/clang/test/Sema/attr-print-zos.c b/clang/test/Sema/attr-print-zos.c new file mode 100644 index 000000000000000..f19926c131a4f39 --- /dev/null +++ b/clang/test/Sema/attr-print-zos.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 %s -triple s390x-ibm-zos -ast-print 
-fzos-extensions | FileCheck %s + +// CHECK: int * __ptr32 p32; +int * __ptr32 p32; + +// CHECK: char * __ptr32 c32; +char * __ptr32 c32; + +// CHECK: void * __ptr32 v32; +void * __ptr32 v32; + +// CHECK: int * __ptr32 *q; +int * __ptr32 *q; + +// CHECK: void *func(int * __ptr32 p); +void *func(int * __ptr32 p); + +// CHECK: int * __ptr32 func1(int * __ptr32 p); +int * __ptr32 func1(int * __ptr32 p); + +// CHECK: int *func2(void * __ptr32 p); +int *func2(void * __ptr32 p); + +// CHECK: int *const __ptr32 r; +int * __ptr32 const r; + +// CHECK: int ** __ptr32 *v; +int * *__ptr32* v; + +// CHECK: int *** __ptr32 *z; +int ** * __ptr32 * z; diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 6f76839724ee9f8..53ed46f14f14dcc 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -59,6 +59,14 @@ static std::string computeDataLayout(const Triple &TT) { // Data mangling. Ret += DataLayout::getManglingComponent(TT); + // Special features for z/OS. + if (TT.isOSzOS()) { + if (TT.isArch64Bit()) { + // Custom address space for ptr32. + Ret += "-p1:32:32"; + } + } + // Make sure that global data has at least 16 bits of alignment by // default, so that we can refer to it using LARL. We don't have any // special requirements for stack variables though. From fbb0619fe2acea4ac8764d13b754505ed8f1b578 Mon Sep 17 00:00:00 2001 From: Alexis Engelke Date: Thu, 8 Aug 2024 14:45:20 +0200 Subject: [PATCH 004/112] [Support][ADT] Minor cleanups after #101706 (#102180) Fix GraphHasNodeNumbers indicator for graphs where NodeRef is not the graph type (e.g., Inverse<...>). Add static_assert tests to MachineBasicBlock GraphTraits. Use GraphHasNodeNumbers in DomTreeBase. Don't include ad-hoc numbering map for numbered graphs in DomTreeBase. 
--- llvm/include/llvm/ADT/GraphTraits.h | 3 +- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 12 ++++++++ llvm/include/llvm/Support/GenericDomTree.h | 29 ++++++++++--------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/ADT/GraphTraits.h b/llvm/include/llvm/ADT/GraphTraits.h index 0764ecb4bb56823..20bb27f50e17f95 100644 --- a/llvm/include/llvm/ADT/GraphTraits.h +++ b/llvm/include/llvm/ADT/GraphTraits.h @@ -97,7 +97,8 @@ struct GraphTraits { namespace detail { template -using has_number_t = decltype(GraphTraits::getNumber(std::declval())); +using has_number_t = decltype(GraphTraits::getNumber( + std::declval::NodeRef>())); } // namespace detail /// Indicate whether a GraphTraits::getNumber() is supported. diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 797e29d071dd9a4..2d238326ee1a307 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -1304,6 +1304,9 @@ template <> struct GraphTraits { } }; +static_assert(GraphHasNodeNumbers, + "GraphTraits getNumber() not detected"); + template <> struct GraphTraits { using NodeRef = const MachineBasicBlock *; using ChildIteratorType = MachineBasicBlock::const_succ_iterator; @@ -1318,6 +1321,9 @@ template <> struct GraphTraits { } }; +static_assert(GraphHasNodeNumbers, + "GraphTraits getNumber() not detected"); + // Provide specializations of GraphTraits to be able to treat a // MachineFunction as a graph of MachineBasicBlocks and to walk it // in inverse order. 
Inverse order for a function is considered @@ -1341,6 +1347,9 @@ template <> struct GraphTraits> { } }; +static_assert(GraphHasNodeNumbers>, + "GraphTraits getNumber() not detected"); + template <> struct GraphTraits> { using NodeRef = const MachineBasicBlock *; using ChildIteratorType = MachineBasicBlock::const_pred_iterator; @@ -1358,6 +1367,9 @@ template <> struct GraphTraits> { } }; +static_assert(GraphHasNodeNumbers>, + "GraphTraits getNumber() not detected"); + // These accessors are handy for sharing templated code between IR and MIR. inline auto successors(const MachineBasicBlock *BB) { return BB->successors(); } inline auto predecessors(const MachineBasicBlock *BB) { diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index d7b94d50e631117..7e2b68e6faea29c 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -262,7 +262,10 @@ class DominatorTreeBase { SmallVector>>; DomTreeNodeStorageTy DomTreeNodes; // For graphs where blocks don't have numbers, create a numbering here. - DenseMap NodeNumberMap; + // TODO: use an empty struct with [[no_unique_address]] in C++20. + std::conditional_t, + DenseMap, std::tuple<>> + NodeNumberMap; DomTreeNodeBase *RootNode = nullptr; ParentPtr Parent = nullptr; @@ -355,12 +358,8 @@ class DominatorTreeBase { } private: - template - using has_number_t = - decltype(GraphTraits::getNumber(std::declval())); - std::optional getNodeIndex(const NodeT *BB) const { - if constexpr (is_detected::value) { + if constexpr (GraphHasNodeNumbers) { // BB can be nullptr, map nullptr to index 0. assert(BlockNumberEpoch == GraphTraits::getNumberEpoch(Parent) && @@ -374,7 +373,7 @@ class DominatorTreeBase { } unsigned getNodeIndexForInsert(const NodeT *BB) { - if constexpr (is_detected::value) { + if constexpr (GraphHasNodeNumbers) { // getNodeIndex will never fail if nodes have getNumber(). 
unsigned Idx = *getNodeIndex(BB); if (Idx >= DomTreeNodes.size()) { @@ -736,7 +735,8 @@ class DominatorTreeBase { } DomTreeNodes[*IdxOpt] = nullptr; - NodeNumberMap.erase(BB); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.erase(BB); if (!IsPostDom) return; @@ -830,7 +830,7 @@ class DominatorTreeBase { private: void updateBlockNumberEpoch() { // Nothing to do for graphs that don't number their blocks. - if constexpr (is_detected::value) + if constexpr (GraphHasNodeNumbers) BlockNumberEpoch = GraphTraits::getNumberEpoch(Parent); } @@ -849,9 +849,8 @@ class DominatorTreeBase { } /// Update dominator tree after renumbering blocks. - template - std::enable_if_t::value, void> - updateBlockNumbers() { + template + std::enable_if_t, void> updateBlockNumbers() { updateBlockNumberEpoch(); unsigned MaxNumber = GraphTraits::getMaxNumber(Parent); @@ -889,7 +888,8 @@ class DominatorTreeBase { void reset() { DomTreeNodes.clear(); - NodeNumberMap.clear(); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.clear(); Roots.clear(); RootNode = nullptr; Parent = nullptr; @@ -989,7 +989,8 @@ class DominatorTreeBase { /// assignable and destroyable state, but otherwise invalid. void wipe() { DomTreeNodes.clear(); - NodeNumberMap.clear(); + if constexpr (!GraphHasNodeNumbers) + NodeNumberMap.clear(); RootNode = nullptr; Parent = nullptr; } From 15dacb452f3ae210837f5fc0839489d9801c8fc3 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Thu, 8 Aug 2024 15:46:06 +0300 Subject: [PATCH 005/112] [AMDGPU][AsmParser] Print names of parsed registers in debug output. (#102328) Knowing just their numeric values is not very helpful. Part of . 
--- llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index c8b594ffbc64522..bbd9d75aac0e916 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "AMDKernelCodeT.h" +#include "MCTargetDesc/AMDGPUInstPrinter.h" #include "MCTargetDesc/AMDGPUMCExpr.h" #include "MCTargetDesc/AMDGPUMCKernelDescriptor.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" @@ -1133,7 +1134,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { void print(raw_ostream &OS) const override { switch (Kind) { case Register: - OS << "'; + OS << "'; break; case Immediate: OS << '<' << getImm(); From b8c560f1592648197d01f402dec5592a72d46c96 Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Thu, 8 Aug 2024 06:00:11 -0700 Subject: [PATCH 006/112] [CMake] Remove EXPORT_SYMBOLS_FOR_PLUGINS from #102138 (#102396) Partially remove some of the changes from #102138 as EXPORT_SYMBOLS_FOR_PLUGINS doesn't work on all the configurations. 
--- clang-tools-extra/clang-tidy/tool/CMakeLists.txt | 6 ++++-- clang/cmake/modules/AddClang.cmake | 2 +- clang/tools/clang-linker-wrapper/CMakeLists.txt | 3 ++- clang/tools/clang-repl/CMakeLists.txt | 4 ++-- clang/tools/driver/CMakeLists.txt | 7 +++++-- flang/tools/flang-driver/CMakeLists.txt | 16 +++++++--------- lld/cmake/modules/AddLLD.cmake | 2 +- lld/tools/lld/CMakeLists.txt | 2 +- llvm/cmake/modules/AddLLVM.cmake | 12 +++++------- llvm/tools/bugpoint/CMakeLists.txt | 2 +- llvm/tools/llc/CMakeLists.txt | 3 ++- llvm/tools/llvm-lto2/CMakeLists.txt | 3 +-- llvm/tools/opt/CMakeLists.txt | 3 ++- llvm/unittests/Analysis/CMakeLists.txt | 11 +++++------ llvm/unittests/Passes/Plugins/CMakeLists.txt | 3 +-- mlir/tools/mlir-opt/CMakeLists.txt | 2 +- 16 files changed, 41 insertions(+), 40 deletions(-) diff --git a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt index 9f327ce838b7073..b220cbea80f1b6d 100644 --- a/clang-tools-extra/clang-tidy/tool/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/tool/CMakeLists.txt @@ -33,7 +33,6 @@ clang_target_link_libraries(clangTidyMain # Support plugins. 
if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) endif() add_clang_tool(clang-tidy @@ -42,7 +41,6 @@ add_clang_tool(clang-tidy DEPENDS clang-resource-headers ${support_plugins} - ${export_symbols} ) clang_target_link_libraries(clang-tidy PRIVATE @@ -59,6 +57,10 @@ target_link_libraries(clang-tidy ${ALL_CLANG_TIDY_CHECKS} ) +if(CLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(clang-tidy) +endif() + install(PROGRAMS clang-tidy-diff.py DESTINATION "${CMAKE_INSTALL_DATADIR}/clang" COMPONENT clang-tidy) diff --git a/clang/cmake/modules/AddClang.cmake b/clang/cmake/modules/AddClang.cmake index 9f264720b1e9ee9..5327b5d2f089288 100644 --- a/clang/cmake/modules/AddClang.cmake +++ b/clang/cmake/modules/AddClang.cmake @@ -160,7 +160,7 @@ macro(add_clang_tool name) AND (NOT LLVM_DISTRIBUTION_COMPONENTS OR ${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS) ) set(get_obj_args ${ARGN}) - list(FILTER get_obj_args EXCLUDE REGEX "^(SUPPORT_PLUGINS|EXPORT_SYMBOLS_FOR_PLUGINS)$") + list(FILTER get_obj_args EXCLUDE REGEX "^SUPPORT_PLUGINS$") generate_llvm_objects(${name} ${get_obj_args}) add_custom_target(${name} DEPENDS llvm-driver clang-resource-headers) else() diff --git a/clang/tools/clang-linker-wrapper/CMakeLists.txt b/clang/tools/clang-linker-wrapper/CMakeLists.txt index 4a16c3ca9f0903c..bf37d8031025ed3 100644 --- a/clang/tools/clang-linker-wrapper/CMakeLists.txt +++ b/clang/tools/clang-linker-wrapper/CMakeLists.txt @@ -31,7 +31,6 @@ add_clang_tool(clang-linker-wrapper DEPENDS ${tablegen_deps} - EXPORT_SYMBOLS_FOR_PLUGINS ) set(CLANG_LINKER_WRAPPER_LIB_DEPS @@ -42,3 +41,5 @@ target_link_libraries(clang-linker-wrapper PRIVATE ${CLANG_LINKER_WRAPPER_LIB_DEPS} ) + +export_executable_symbols_for_plugins(clang-linker-wrapper) diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index 52b740b356284e4..a35ff13494e1153 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ 
b/clang/tools/clang-repl/CMakeLists.txt @@ -9,8 +9,6 @@ set( LLVM_LINK_COMPONENTS add_clang_tool(clang-repl ClangRepl.cpp - - EXPORT_SYMBOLS_FOR_PLUGINS ) if(MSVC) @@ -63,6 +61,8 @@ clang_target_link_libraries(clang-repl PRIVATE clangInterpreter ) +export_executable_symbols_for_plugins(clang-repl) + # The clang-repl binary can get huge with static linking in debug mode. # Some 32-bit targets use PLT slots with limited branch range by default and we # start to exceed this limit, e.g. when linking for arm-linux-gnueabihf with diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index 805dffb0d9b7064..018605c2fd4f268 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -21,7 +21,6 @@ set( LLVM_LINK_COMPONENTS # Support plugins. if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) endif() add_clang_tool(clang @@ -36,7 +35,6 @@ add_clang_tool(clang ARMTargetParserTableGen AArch64TargetParserTableGen ${support_plugins} - ${export_symbols} GENERATE_DRIVER ) @@ -56,6 +54,11 @@ else() set_target_properties(clang PROPERTIES VERSION ${CLANG_EXECUTABLE_VERSION}) endif() +# Support plugins. +if(CLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(clang) +endif() + add_dependencies(clang clang-resource-headers) if(NOT CLANG_LINKS_TO_CREATE) diff --git a/flang/tools/flang-driver/CMakeLists.txt b/flang/tools/flang-driver/CMakeLists.txt index baa949600283517..9f33cdfe3fa90f7 100644 --- a/flang/tools/flang-driver/CMakeLists.txt +++ b/flang/tools/flang-driver/CMakeLists.txt @@ -11,18 +11,9 @@ set( LLVM_LINK_COMPONENTS TargetParser ) -option(FLANG_PLUGIN_SUPPORT "Build Flang with plugin support." 
ON) - -# Enable support for plugins, which need access to symbols from flang-new -if(FLANG_PLUGIN_SUPPORT) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) -endif() - add_flang_tool(flang-new driver.cpp fc1_main.cpp - - ${export_symbols} ) target_link_libraries(flang-new @@ -37,4 +28,11 @@ clang_target_link_libraries(flang-new clangBasic ) +option(FLANG_PLUGIN_SUPPORT "Build Flang with plugin support." ON) + +# Enable support for plugins, which need access to symbols from flang-new +if(FLANG_PLUGIN_SUPPORT) + export_executable_symbols_for_plugins(flang-new) +endif() + install(TARGETS flang-new DESTINATION "${CMAKE_INSTALL_BINDIR}") diff --git a/lld/cmake/modules/AddLLD.cmake b/lld/cmake/modules/AddLLD.cmake index 34f9974efbf50af..9f2684b6f933eca 100644 --- a/lld/cmake/modules/AddLLD.cmake +++ b/lld/cmake/modules/AddLLD.cmake @@ -44,7 +44,7 @@ macro(add_lld_tool name) AND (NOT LLVM_DISTRIBUTION_COMPONENTS OR ${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS) ) set(get_obj_args ${ARGN}) - list(FILTER get_obj_args EXCLUDE REGEX "^(SUPPORT_PLUGINS|EXPORT_SYMBOLS_FOR_PLUGINS)$") + list(FILTER get_obj_args EXCLUDE REGEX "^SUPPORT_PLUGINS$") generate_llvm_objects(${name} ${get_obj_args}) add_custom_target(${name} DEPENDS llvm-driver) else() diff --git a/lld/tools/lld/CMakeLists.txt b/lld/tools/lld/CMakeLists.txt index 630d38f770a7fee..8498a91597a930c 100644 --- a/lld/tools/lld/CMakeLists.txt +++ b/lld/tools/lld/CMakeLists.txt @@ -8,8 +8,8 @@ add_lld_tool(lld SUPPORT_PLUGINS GENERATE_DRIVER - EXPORT_SYMBOLS_FOR_PLUGINS ) +export_executable_symbols_for_plugins(lld) function(lld_target_link_libraries target type) if (TARGET obj.${target}) diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 257dc2250bb4ef8..3e7e3a965559afa 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1010,7 +1010,7 @@ endmacro() macro(add_llvm_executable name) cmake_parse_arguments(ARG - 
"DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH;SUPPORT_PLUGINS;EXPORT_SYMBOLS;EXPORT_SYMBOLS_FOR_PLUGINS" + "DISABLE_LLVM_LINK_LLVM_DYLIB;IGNORE_EXTERNALIZE_DEBUGINFO;NO_INSTALL_RPATH;SUPPORT_PLUGINS;EXPORT_SYMBOLS" "ENTITLEMENTS;BUNDLE_PATH" "" ${ARGN}) @@ -1081,12 +1081,6 @@ macro(add_llvm_executable name) endif() endif() - if (ARG_EXPORT_SYMBOLS) - export_executable_symbols(${name}) - elseif(ARG_EXPORT_SYMBOLS_FOR_PLUGINS) - export_executable_symbols_for_plugins(${name}) - endif() - if (LLVM_LINK_LLVM_DYLIB AND NOT ARG_DISABLE_LLVM_LINK_LLVM_DYLIB) set(USE_SHARED USE_SHARED) endif() @@ -1118,6 +1112,10 @@ macro(add_llvm_executable name) endif() llvm_codesign(${name} ENTITLEMENTS ${ARG_ENTITLEMENTS} BUNDLE_PATH ${ARG_BUNDLE_PATH}) + + if (ARG_EXPORT_SYMBOLS) + export_executable_symbols(${name}) + endif() endmacro(add_llvm_executable name) # add_llvm_pass_plugin(name [NO_MODULE] ...) diff --git a/llvm/tools/bugpoint/CMakeLists.txt b/llvm/tools/bugpoint/CMakeLists.txt index f846aed24b75ebb..b0e71910c7cc38f 100644 --- a/llvm/tools/bugpoint/CMakeLists.txt +++ b/llvm/tools/bugpoint/CMakeLists.txt @@ -37,5 +37,5 @@ add_llvm_tool(bugpoint DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) +export_executable_symbols_for_plugins(bugpoint) diff --git a/llvm/tools/llc/CMakeLists.txt b/llvm/tools/llc/CMakeLists.txt index c5407944dd2138c..01825c6e4c64c77 100644 --- a/llvm/tools/llc/CMakeLists.txt +++ b/llvm/tools/llc/CMakeLists.txt @@ -30,5 +30,6 @@ add_llvm_tool(llc DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) + +export_executable_symbols_for_plugins(llc) diff --git a/llvm/tools/llvm-lto2/CMakeLists.txt b/llvm/tools/llvm-lto2/CMakeLists.txt index 335392fb8990a07..3b4644d6e277150 100644 --- a/llvm/tools/llvm-lto2/CMakeLists.txt +++ b/llvm/tools/llvm-lto2/CMakeLists.txt @@ -21,6 +21,5 @@ add_llvm_tool(llvm-lto2 DEPENDS intrinsics_gen - - EXPORT_SYMBOLS_FOR_PLUGINS ) 
+export_executable_symbols_for_plugins(llvm-lto2) diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index c235fcf1ac9605b..8d031b2cc57c786 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -45,7 +45,8 @@ add_llvm_tool(opt DEPENDS intrinsics_gen SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) target_link_libraries(opt PRIVATE LLVMOptDriver) + +export_executable_symbols_for_plugins(opt) diff --git a/llvm/unittests/Analysis/CMakeLists.txt b/llvm/unittests/Analysis/CMakeLists.txt index a1199adba076a7d..3cba630867a83b3 100644 --- a/llvm/unittests/Analysis/CMakeLists.txt +++ b/llvm/unittests/Analysis/CMakeLists.txt @@ -62,14 +62,8 @@ else() LIST(APPEND LLVM_OPTIONAL_SOURCES ${MLGO_TESTS}) endif() -# Export symbols from the plugins shared objects. -if(NOT WIN32) - set(export_symbols EXPORT_SYMBOLS_FOR_PLUGINS) -endif() - add_llvm_unittest_with_input_files(AnalysisTests ${ANALYSIS_TEST_SOURCES} - ${export_symbols} ) add_dependencies(AnalysisTests intrinsics_gen) @@ -82,5 +76,10 @@ if(CMAKE_SYSTEM_NAME STREQUAL "AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-brtl") endif() +# Export symbols from the plugins shared objects. 
+if(NOT WIN32) + export_executable_symbols_for_plugins(AnalysisTests) +endif() + add_subdirectory(InlineAdvisorPlugin) add_subdirectory(InlineOrderPlugin) diff --git a/llvm/unittests/Passes/Plugins/CMakeLists.txt b/llvm/unittests/Passes/Plugins/CMakeLists.txt index 55d7e715014f4b7..e90cae167bc2223 100644 --- a/llvm/unittests/Passes/Plugins/CMakeLists.txt +++ b/llvm/unittests/Passes/Plugins/CMakeLists.txt @@ -6,9 +6,8 @@ if (NOT WIN32 AND NOT CYGWIN) set(LLVM_LINK_COMPONENTS Support Passes Core AsmParser) add_llvm_unittest(PluginsTests PluginsTest.cpp - - EXPORT_SYMBOLS_FOR_PLUGINS ) + export_executable_symbols_for_plugins(PluginsTests) target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport) unset(LLVM_LINK_COMPONENTS) diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 1209c53d81bfb28..8b79de58fa1028b 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -102,9 +102,9 @@ add_mlir_tool(mlir-opt DEPENDS ${LIBS} SUPPORT_PLUGINS - EXPORT_SYMBOLS_FOR_PLUGINS ) target_link_libraries(mlir-opt PRIVATE ${LIBS}) llvm_update_compile_flags(mlir-opt) mlir_check_all_link_libraries(mlir-opt) +export_executable_symbols_for_plugins(mlir-opt) From ea1dee76c7bc88ada21686fb2579a3ed7f284028 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 8 Aug 2024 15:03:03 +0200 Subject: [PATCH 007/112] [clang][Interp] Don't diagnose indexing arrays with index 0 (#102454) Even if the array is of unknown bounds, index 0 is fine. 
--- clang/lib/AST/Interp/Interp.h | 4 ++-- clang/test/AST/Interp/literals.cpp | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 423508b3adb996f..832fc028ad6696c 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2421,7 +2421,7 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.peek(); - if (!Ptr.isZero()) { + if (!Ptr.isZero() && !Offset.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; } @@ -2437,7 +2437,7 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); - if (!Ptr.isZero()) { + if (!Ptr.isZero() && !Offset.isZero()) { if (!CheckArray(S, OpPC, Ptr)) return false; } diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp index 815fb67b9bbfc53..a46f6ed747ec2f3 100644 --- a/clang/test/AST/Interp/literals.cpp +++ b/clang/test/AST/Interp/literals.cpp @@ -1196,13 +1196,15 @@ namespace incdecbool { } #if __cplusplus >= 201402L -/// NOTE: The diagnostics of the two interpreters are a little -/// different here, but they both make sense. constexpr int externvar1() { // both-error {{never produces a constant expression}} - extern char arr[]; // ref-note {{declared here}} - return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \ - // expected-note {{indexing of array without known bound}} + extern char arr[]; // both-note {{declared here}} + return arr[0]; // both-note {{read of non-constexpr variable 'arr'}} } +namespace externarr { + extern int arr[]; + constexpr int *externarrindex = &arr[0]; /// No diagnostic. 
+} + namespace StmtExprs { constexpr int foo() { From 47d831f2c90225a7d267ec5e3149d5584287dbbc Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 17:06:11 +0400 Subject: [PATCH 008/112] TTI: Check legalization cost of min/max ISD nodes (#100514) Instead of counting the cost of the assumed expansion. The AMDGPU costs for the i64 case look too high to me. Preserve default expansion logic --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 38 ++-- .../CostModel/AMDGPU/arith-sminmax.ll | 164 +++++++----------- .../CostModel/AMDGPU/arith-uminmax.ll | 164 +++++++----------- llvm/test/Analysis/CostModel/ARM/cmps.ll | 4 +- .../Analysis/CostModel/ARM/reduce-smax.ll | 12 +- .../Analysis/CostModel/ARM/reduce-smin.ll | 12 +- .../Analysis/CostModel/ARM/reduce-umax.ll | 12 +- .../Analysis/CostModel/ARM/reduce-umin.ll | 12 +- .../SLPVectorizer/AMDGPU/min_max.ll | 76 ++++++-- 9 files changed, 226 insertions(+), 268 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 93086d4ac2c8df7..c0ee50db37e6761 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2135,21 +2135,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Cost; } case Intrinsic::smax: + ISD = ISD::SMAX; + break; case Intrinsic::smin: + ISD = ISD::SMIN; + break; case Intrinsic::umax: - case Intrinsic::umin: { - // minmax(X,Y) = select(icmp(X,Y),X,Y) - Type *CondTy = RetTy->getWithNewBitWidth(1); - bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; - CmpInst::Predicate Pred = - IsUnsigned ? 
CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; - InstructionCost Cost = 0; - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, - Pred, CostKind); - Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, - Pred, CostKind); - return Cost; - } + ISD = ISD::UMAX; + break; + case Intrinsic::umin: + ISD = ISD::UMIN; + break; case Intrinsic::sadd_sat: ISD = ISD::SADDSAT; break; @@ -2345,6 +2341,22 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); } + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::umin: + case Intrinsic::umax: { + // minmax(X,Y) = select(icmp(X,Y),X,Y) + Type *CondTy = RetTy->getWithNewBitWidth(1); + bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; + CmpInst::Predicate Pred = + IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; + InstructionCost Cost = 0; + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, + Pred, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, + Pred, CostKind); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { // Assume a default expansion. 
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll index c73b8b716bc5417..8afb26722cc16b7 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-sminmax.ll @@ -42,24 +42,24 @@ define i32 @smax(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> 
@llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = 
call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'smax' @@ -67,50 +67,25 @@ define i32 @smax(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 
for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'smax' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smax.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smax.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 
@llvm.smax.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -173,24 +148,24 @@ define i32 @smin(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x 
i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 
x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'smin' @@ -198,50 +173,25 @@ define i32 @smin(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 
@llvm.smin.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x 
i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = 
call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'smin' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.smin.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.smin.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -271,3 +221,5 @@ define i32 @smin(i32 %arg) { ret i32 undef 
} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ALL-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll index 68687743d75b610..2b3728b556d9e5b 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/arith-uminmax.ll @@ -42,24 +42,24 @@ define i32 @umax(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x 
i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x 
i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'umax' @@ -67,50 +67,25 @@ define i32 @umax(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: 
%V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 
x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 
= call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'umax' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umax.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umax.v64i8(<64 x i8> undef, <64 x i8> 
undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -173,24 +148,24 @@ define i32 @umin(i32 %arg) { ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; FAST-NEXT: Cost 
Model: Found an estimated cost of 30 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; FAST-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; 
FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; FAST-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) ; FAST-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; ; SLOW-LABEL: 'umin' @@ -198,50 +173,25 @@ define i32 @umin(i32 %arg) { ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 
undef, i32 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, 
<4 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; SLOW-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x 
i8> undef, <8 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) +; SLOW-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) ; SLOW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef -; -; ALL-SIZE-LABEL: 'umin' -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 
@llvm.umin.i16(i16 undef, i16 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.umin.v32i8(<32 x i8> undef, <32 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.umin.v64i8(<64 x i8> undef, <64 x i8> undef) -; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef) %V2I64 = call <2 x i64> 
@llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef) @@ -271,3 +221,5 @@ define i32 @umin(i32 %arg) { ret i32 undef } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ALL-SIZE: {{.*}} diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll index 184b7076d02bea9..e86a1b95a7b36b7 100644 --- a/llvm/test/Analysis/CostModel/ARM/cmps.ll +++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll @@ -227,7 +227,7 @@ define void @minmax() { ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef ; CHECK-V8R-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef @@ -287,7 +287,7 @@ define void @minmax() { ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c3 = icmp slt i32 undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s3 = select i1 %c3, i32 undef, i32 undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %c4 = icmp slt <4 x i32> undef, undef -; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef +; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt ptr undef, undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s5 = select i1 %c5, ptr undef, ptr undef ; CHECK-V8R-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c6 = icmp slt <4 x ptr> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll index 4ef35f154789604..7dcab51e0a1cf3f 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost 
Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 
@llvm.vector.reduce.smax.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll index f8bce922abcd040..617c6b4605189cf 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> 
undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll index 24f76e60cd31027..764034d18bee026 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll 
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 
@llvm.vector.reduce.umax.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll index ee7025cf5db2501..b5431f63bdca977 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll @@ -48,7 +48,7 @@ define i32 @reduce_i32(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i32' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) +; 
NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef) @@ -82,8 +82,8 @@ define i32 @reduce_i16(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i16' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef) @@ -120,9 +120,9 @@ define i32 @reduce_i8(i32 %arg) { ; V8M-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-LABEL: 'reduce_i8' -; NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) -; NEON-NEXT: Cost Model: Found an 
estimated cost of 55 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) -; NEON-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef) +; NEON-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef) ; NEON-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef) diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll index f44705a925d5a73..46c6c10125b95ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/min_max.ll @@ -18,8 +18,15 @@ define <2 x i16> @uadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; 
GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @uadd_sat_v2i16( ; GFX9-NEXT: bb: @@ -53,8 +60,15 @@ define <2 x i16> @usub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @usub_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.umax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @usub_sat_v2i16( ; GFX9-NEXT: bb: @@ -88,8 +102,15 @@ define <2 x i16> @sadd_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @sadd_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_0]], 
i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @sadd_sat_v2i16( ; GFX9-NEXT: bb: @@ -123,8 +144,15 @@ define <2 x i16> @ssub_sat_v2i16(<2 x i16> %arg0, <2 x i16> %arg1) { ; ; GFX8-LABEL: @ssub_sat_v2i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.smax.v2i16(<2 x i16> [[ARG0:%.*]], <2 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: ret <2 x i16> [[TMP0]] +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <2 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <2 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <2 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <2 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.smax.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <2 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <2 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: ret <2 x i16> [[INS_1]] ; ; GFX9-LABEL: @ssub_sat_v2i16( ; GFX9-NEXT: bb: @@ -262,11 +290,18 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v3i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = call <3 x i16> @llvm.umin.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]]) +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <3 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GFX8-NEXT: 
[[ARG1_0:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <3 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP0]], i16 [[ADD_2]], i64 2 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; ; GFX9-LABEL: @uadd_sat_v3i16( @@ -317,11 +352,18 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) { ; ; GFX8-LABEL: @uadd_sat_v4i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG1:%.*]]) -; GFX8-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) -; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <2 x i32> -; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> -; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> +; GFX8-NEXT: [[ARG0_0:%.*]] = extractelement <4 x i16> [[ARG0:%.*]], i64 0 +; GFX8-NEXT: [[ARG0_1:%.*]] = extractelement <4 x i16> [[ARG0]], i64 1 +; GFX8-NEXT: [[ARG1_0:%.*]] = extractelement <4 x i16> [[ARG1:%.*]], i64 0 +; GFX8-NEXT: [[ARG1_1:%.*]] = extractelement <4 x i16> [[ARG1]], i64 1 +; GFX8-NEXT: [[ADD_0:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_0]], i16 [[ARG1_0]]) +; GFX8-NEXT: [[ADD_1:%.*]] = call i16 @llvm.umin.i16(i16 [[ARG0_1]], i16 [[ARG1_1]]) +; GFX8-NEXT: [[TMP0:%.*]] = 
call <4 x i16> @llvm.umin.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG1]]) +; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <4 x i16> poison, i16 [[ADD_0]], i64 0 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <4 x i16> [[INS_0]], i16 [[ADD_1]], i64 1 +; GFX8-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[INS_1]], <4 x i16> [[TMP2]], <4 x i32> ; GFX8-NEXT: ret <4 x i16> [[INS_31]] ; ; GFX9-LABEL: @uadd_sat_v4i16( From 46bf5d547fa7f6066019750effaa6c2aa119a8da Mon Sep 17 00:00:00 2001 From: ykhatav Date: Thu, 8 Aug 2024 09:07:39 -0400 Subject: [PATCH 009/112] Globalopt pass produces invalid debug info (#100654) This patch fixes an issue in the GlobalOpt pass where deleting a global variable fails to update the corresponding dbg.value and it references an empty metadata entry. The SalvageDebugInfo() function has been updated to handle dbg.value intrinsic when globals are deleted. --- llvm/lib/IR/Metadata.cpp | 6 +++++ llvm/lib/Transforms/IPO/GlobalOpt.cpp | 1 + llvm/test/DebugInfo/X86/undef-dbg-val.ll | 34 ++++++++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 llvm/test/DebugInfo/X86/undef-dbg-val.ll diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 8669825749d83c7..2599de3437ba293 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -346,6 +346,12 @@ void ReplaceableMetadataImpl::SalvageDebugInfo(const Constant &C) { MetadataTracking::OwnerTy Owner = Pair.second.first; if (!Owner) continue; + // Check for MetadataAsValue. 
+ if (isa(Owner)) { + cast(Owner)->handleChangedMetadata( + ValueAsMetadata::get(UndefValue::get(C.getType()))); + continue; + } if (!isa(Owner)) continue; auto *OwnerMD = dyn_cast_if_present(cast(Owner)); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index ab1e41ebf9a9d6c..5293a777496bc7b 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1338,6 +1338,7 @@ deleteIfDead(GlobalValue &GV, if (DeleteFnCallback) DeleteFnCallback(*F); } + ReplaceableMetadataImpl::SalvageDebugInfo(GV); GV.eraseFromParent(); ++NumDeleted; return true; diff --git a/llvm/test/DebugInfo/X86/undef-dbg-val.ll b/llvm/test/DebugInfo/X86/undef-dbg-val.ll new file mode 100644 index 000000000000000..61f3776c22d5a41 --- /dev/null +++ b/llvm/test/DebugInfo/X86/undef-dbg-val.ll @@ -0,0 +1,34 @@ +; RUN: opt -S -passes=globalopt --experimental-debuginfo-iterators=false <%s | FileCheck %s +; CHECK: #dbg_value(ptr undef, +; CHECK-SAME: [[VAR:![0-9]+]], +; CHECK-SAME: !DIExpression() +; CHECK: [[VAR]] = !DILocalVariable(name: "_format" + + +; ModuleID = '' +source_filename = "test.cpp" + +@_ZZZZ4main_format = internal constant [24 x i8] c"Result1: Hello, World!\0A\00", align 16, !dbg !9 + +define void @foo() align 2 !dbg !5 { +entry: + call void @llvm.dbg.value(metadata ptr @_ZZZZ4main_format, metadata !11, metadata !DIExpression()), !dbg !12 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.cpp", directory: "/path/to") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 27, type: !6, scopeLine: 27, spFlags: DISPFlagDefinition, unit: !0) +!6 = !DISubroutineType(types: !7) +!7 = !{null} +!8 = !DIBasicType(name: "int", 
size: 32, encoding: DW_ATE_signed) +!9 = !DIGlobalVariableExpression(var: !10, expr: !DIExpression()) +!10 = distinct !DIGlobalVariable(name: "_format", scope: !5, file: !1, line: 49, type: !8, isLocal: true, isDefinition: true) +!11 = !DILocalVariable(name: "_format", arg: 1, scope: !5, file: !1, line: 79, type: !8) +!12 = !DILocation(line: 0, scope: !5) From 1139dee910c45cfdd4a6066b22addef585e04032 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Aug 2024 15:13:30 +0200 Subject: [PATCH 010/112] [SimplifyCFG] Add more sinking tests (NFC) --- .../SimplifyCFG/X86/sink-common-code.ll | 146 ++++++++++++++++++ 1 file changed, 146 insertions(+) diff --git a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll index 0150b3b60d9e42b..cb2bbb8e0b9317e 100644 --- a/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/X86/sink-common-code.ll @@ -2033,6 +2033,152 @@ join: ret void } +define i32 @many_indirect_phis(i1 %cond, i32 %a, i32 %b) { +; CHECK-LABEL: @many_indirect_phis( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[JOIN:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[B_SINK:%.*]] = phi i32 [ [[A:%.*]], [[IF]] ], [ [[B:%.*]], [[TMP0:%.*]] ] +; CHECK-NEXT: [[DOTSINK3:%.*]] = phi i32 [ 10, [[IF]] ], [ 11, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK2:%.*]] = phi i32 [ 20, [[IF]] ], [ 21, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK1:%.*]] = phi i32 [ 30, [[IF]] ], [ 31, [[TMP0]] ] +; CHECK-NEXT: [[DOTSINK:%.*]] = phi i32 [ 40, [[IF]] ], [ 41, [[TMP0]] ] +; CHECK-NEXT: [[ADD_0_B:%.*]] = add i32 [[B_SINK]], 1 +; CHECK-NEXT: [[ADD_1_B:%.*]] = add i32 [[ADD_0_B]], [[DOTSINK3]] +; CHECK-NEXT: [[ADD_2_B:%.*]] = add i32 [[ADD_1_B]], [[DOTSINK2]] +; CHECK-NEXT: [[ADD_3_B:%.*]] = add i32 [[ADD_2_B]], [[DOTSINK1]] +; CHECK-NEXT: [[ADD_4_B:%.*]] = add i32 [[ADD_3_B]], [[DOTSINK]] +; CHECK-NEXT: ret i32 
[[ADD_4_B]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.0.a = add i32 %a, 1 + %add.1.a = add i32 %add.0.a, 10 + %add.2.a = add i32 %add.1.a, 20 + %add.3.a = add i32 %add.2.a, 30 + %add.4.a = add i32 %add.3.a, 40 + br label %join + +else: + %add.0.b = add i32 %b, 1 + %add.1.b = add i32 %add.0.b, 11 + %add.2.b = add i32 %add.1.b, 21 + %add.3.b = add i32 %add.2.b, 31 + %add.4.b = add i32 %add.3.b, 41 + br label %join + +join: + %phi = phi i32 [ %add.4.a, %if ], [ %add.4.b, %else ] + ret i32 %phi +} + +define i32 @store_and_unrelated_many_phi_add(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_unrelated_many_phi_add( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: store i32 1, ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + store i32 1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + store i32 1, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + +define i32 @store_and_related_many_phi_add(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_related_many_phi_add( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[ADD_2_SINK:%.*]] = phi i32 [ [[ADD_2]], [[ELSE]] ], [ [[ADD_1]], [[IF]] ] +; 
CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: store i32 [[ADD_2_SINK]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + store i32 %add.1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + store i32 %add.2, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + +define i32 @store_and_unrelated_many_phi_add2(i1 %cond, ptr %p, i32 %a, i32 %b) { +; CHECK-LABEL: @store_and_unrelated_many_phi_add2( +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] +; CHECK: if: +; CHECK-NEXT: call void @dummy() +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: br label [[JOIN:%.*]] +; CHECK: else: +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[B:%.*]], 3 +; CHECK-NEXT: br label [[JOIN]] +; CHECK: join: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[ADD_1]], [[IF]] ], [ [[ADD_2]], [[ELSE]] ] +; CHECK-NEXT: [[ADD_A_2:%.*]] = add i32 [[A]], 1 +; CHECK-NEXT: store i32 [[ADD_A_2]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: ret i32 [[PHI]] +; + br i1 %cond, label %if, label %else + +if: + call void @dummy() + %add.1 = add i32 %a, 2 + %add.a.1 = add i32 %a, 1 + store i32 %add.a.1, ptr %p + br label %join + +else: + %add.2 = add i32 %b, 3 + %add.a.2 = add i32 %a, 1 + store i32 %add.a.2, ptr %p + br label %join + +join: + %phi = phi i32 [ %add.1, %if ], [ %add.2, %else ] + ret i32 %phi +} + declare void @dummy() declare void @use.ptr(ptr) From fd7d7882e7fa5a38d4bfde426120d4663718beb4 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Aug 2024 14:25:06 +0100 Subject: [PATCH 011/112] [DebugInfo][RemoveDIs] Use iterators to insert everywhere (#102003) These are the final few places in LLVM where we use instruction pointers to identify the position that we're inserting something. 
We're trying to get away from that with a view to deprecating those methods, thus use iterators in all these places. I believe they're all debug-info safe. The sketchiest part is the ExtractValueInst copy constructor, where we cast nullptr to a BasicBlock pointer, so that we take the non-default insert-into-no-block path for instruction insertion, instead of the default nullptr-instruction path for UnaryInstruction. Such a hack is necessary until we get rid of the instruction constructor entirely. --- llvm/lib/IR/Instructions.cpp | 5 +++-- llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 2 +- .../Transforms/Instrumentation/AddressSanitizer.cpp | 4 ++-- llvm/lib/Transforms/Scalar/LICM.cpp | 11 ++++++----- llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp | 2 +- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++-- 6 files changed, 16 insertions(+), 13 deletions(-) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 58ebe7e95cd06cd..93fa635e9b4e171 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2484,8 +2484,9 @@ void ExtractValueInst::init(ArrayRef Idxs, const Twine &Name) { } ExtractValueInst::ExtractValueInst(const ExtractValueInst &EVI) - : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0)), - Indices(EVI.Indices) { + : UnaryInstruction(EVI.getType(), ExtractValue, EVI.getOperand(0), + (BasicBlock *)nullptr), + Indices(EVI.Indices) { SubclassOptionalData = EVI.SubclassOptionalData; } diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index b5b590e2b7acf2c..3a1f690bf039076 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -809,7 +809,7 @@ bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, Value *Dst = NF ? 
NF : CI->getCalledOperand(); FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType); - NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI); + NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI->getIterator()); CallInst::TailCallKind TCK = CI->getTailCallKind(); assert(TCK != CallInst::TCK_MustTail); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 0ff5e9b815adfcc..67dd2b2052472c7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -690,8 +690,8 @@ class RuntimeCallInserter { if (EHPad && EHPad->isEHPad()) { // Replace CI with a clone with an added funclet OperandBundle OperandBundleDef OB("funclet", EHPad); - auto *NewCall = - CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, OB, CI); + auto *NewCall = CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, + OB, CI->getIterator()); NewCall->copyMetadata(*CI); CI->replaceAllUsesWith(NewCall); CI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d48b1286b1e08f4..526ae4e88343967 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2767,8 +2767,9 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, unsigned OpIdx = U->getOperandNo(); auto *LHS = OpIdx == 0 ? Mul : Ins->getOperand(0); auto *RHS = OpIdx == 1 ? 
Mul : Ins->getOperand(1); - auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, - Ins->getName() + ".reass", Ins); + auto *NewBO = + BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, + Ins->getName() + ".reass", Ins->getIterator()); NewBO->copyIRFlags(Ins); if (VariantOp == Ins) VariantOp = NewBO; @@ -2822,9 +2823,9 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, assert(Preheader && "Loop is not in simplify form?"); auto *Inv = BinaryOperator::Create(Opcode, C1, C2, "invariant.op", - Preheader->getTerminator()); - auto *NewBO = - BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO); + Preheader->getTerminator()->getIterator()); + auto *NewBO = BinaryOperator::Create( + Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator()); // Copy NUW for ADDs if both instructions have it. if (Opcode == Instruction::Add && BO->hasNoUnsignedWrap() && diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 61183752ab90598..cfae63405966ff9 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -460,7 +460,7 @@ static void convertMetadataToAssumes(LoadInst *LI, Value *Val, LLVMContext &Ctx = LI->getContext(); new StoreInst(ConstantInt::getTrue(Ctx), PoisonValue::get(PointerType::getUnqual(Ctx)), - /*isVolatile=*/false, Align(1), LI); + /*isVolatile=*/false, Align(1), LI->getIterator()); return; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index de99fb4bee2377a..f82370d738fc690 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2632,8 +2632,9 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( } // Create phi nodes to merge from the backedge-taken check block. 
- PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getFirstNonPHI()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getFirstNonPHIIt()); // Copy original phi DL over to the new one. BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); From b143b2483fc5d7e73763ff9292dec6479552de9e Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 8 Aug 2024 06:41:06 -0700 Subject: [PATCH 012/112] [LLVM][rtsan] Add sanitize_realtime attribute for the realtime sanitizer (#100596) Add a new "sanitize_realtime" attribute, which will correspond to the nonblocking function effect in clang. This is used in the realtime sanitizer transform. Please see the [reviewer support document](https://github.com/realtime-sanitizer/radsan/blob/doc/review-support/doc/review.md) for what our next steps are. The original discourse thread can be found [here](https://discourse.llvm.org/t/rfc-nolock-and-noalloc-attributes/76837) --- llvm/docs/LangRef.rst | 4 ++++ llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 + llvm/include/llvm/IR/Attributes.td | 4 ++++ llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 ++ llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 ++ llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 + llvm/test/Bitcode/attributes.ll | 7 +++++++ llvm/test/Bitcode/compatibility.ll | 8 ++++++-- 8 files changed, 27 insertions(+), 2 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index b17e3c828ed3d5e..36c9e8bb339f920 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2309,6 +2309,10 @@ example: This attribute indicates that MemTagSanitizer checks (dynamic address safety analysis based on Armv8 MTE) are enabled for this function. +``sanitize_realtime`` + This attribute indicates that RealtimeSanitizer checks + (realtime safety analysis - no allocations, syscalls or exceptions) are enabled + for this function. 
``speculative_load_hardening`` This attribute indicates that `Speculative Load Hardening `_ diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index fb88f2fe75adb51..4beac37a5834457 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -758,6 +758,7 @@ enum AttributeKindCodes { ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 93, ATTR_KIND_INITIALIZES = 94, ATTR_KIND_HYBRID_PATCHABLE = 95, + ATTR_KIND_SANITIZE_REALTIME = 96, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index e1bd193891c1e1d..891e34fec0c7985 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -297,6 +297,9 @@ def SanitizeMemTag : EnumAttr<"sanitize_memtag", [FnAttr]>; /// NumericalStabilitySanitizer is on. def SanitizeNumericalStability : EnumAttr<"sanitize_numerical_stability", [FnAttr]>; +/// RealtimeSanitizer is on. +def SanitizeRealtime : EnumAttr<"sanitize_realtime", [FnAttr]>; + /// Speculative Load Hardening is enabled. 
/// /// Note that this uses the default compatibility (always compatible during @@ -385,6 +388,7 @@ def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; +def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; def : CompatRule<"isEqual">; diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index fd4ae109b4bb8f7..d4dbab04e8ecdbc 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2141,6 +2141,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::SanitizeMemory; case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY: return Attribute::SanitizeNumericalStability; + case bitc::ATTR_KIND_SANITIZE_REALTIME: + return Attribute::SanitizeRealtime; case bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING: return Attribute::SpeculativeLoadHardening; case bitc::ATTR_KIND_SWIFT_ERROR: diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index 52e15e6880ef286..33ec14b60dd2884 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -843,6 +843,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_SANITIZE_MEMORY; case Attribute::SanitizeNumericalStability: return bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY; + case Attribute::SanitizeRealtime: + return bitc::ATTR_KIND_SANITIZE_REALTIME; case Attribute::SpeculativeLoadHardening: return bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING; case Attribute::SwiftError: diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5bca5cf8ff91f7c..94c7f161fc4c739 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -959,6 +959,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs, case 
Attribute::SanitizeThread: case Attribute::SanitizeHWAddress: case Attribute::SanitizeMemTag: + case Attribute::SanitizeRealtime: case Attribute::SpeculativeLoadHardening: case Attribute::StackProtect: case Attribute::StackProtectReq: diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index f4dc9b9849827a8..b25bd7582a05afd 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -505,6 +505,12 @@ define void @f86() nosanitize_bounds ret void; } +; CHECK: define void @f92() #53 +define void @f92() sanitize_realtime +{ + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -599,6 +605,7 @@ define void @initializes(ptr initializes((-4, 0), (4, 8)) %a) { ; CHECK: attributes #50 = { disable_sanitizer_instrumentation } ; CHECK: attributes #51 = { uwtable(sync) } ; CHECK: attributes #52 = { nosanitize_bounds } +; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll index e5592b347425a24..fd60c49a4be39be 100644 --- a/llvm/test/Bitcode/compatibility.ll +++ b/llvm/test/Bitcode/compatibility.ll @@ -1562,7 +1562,7 @@ exit: ; CHECK: select <2 x i1> , <2 x i8> , <2 x i8> call void @f.nobuiltin() builtin - ; CHECK: call void @f.nobuiltin() #52 + ; CHECK: call void @f.nobuiltin() #53 call fastcc noalias ptr @f.noalias() noinline ; CHECK: call fastcc noalias ptr @f.noalias() #12 @@ -1989,6 +1989,9 @@ declare void @f.allockind() allockind("alloc,uninitialized") declare void @f.sanitize_numerical_stability() sanitize_numerical_stability ; CHECK: declare void @f.sanitize_numerical_stability() #51 +declare void @f.sanitize_realtime() sanitize_realtime +; CHECK: declare void @f.sanitize_realtime() #52 + ; CHECK: 
declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan)) @@ -2111,7 +2114,8 @@ define float @nofpclass_callsites(float %arg) { ; CHECK: attributes #49 = { nosanitize_bounds } ; CHECK: attributes #50 = { allockind("alloc,uninitialized") } ; CHECK: attributes #51 = { sanitize_numerical_stability } -; CHECK: attributes #52 = { builtin } +; CHECK: attributes #52 = { sanitize_realtime } +; CHECK: attributes #53 = { builtin } ;; Metadata From 59531cf01eb791f4cef88c2757d399eb3d90a086 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Thu, 8 Aug 2024 09:45:51 -0400 Subject: [PATCH 013/112] [flang] Set the offset based on the significant bytes in the boz input in big endian (#102334) The offset to the input data should be counted from most significant bit instead of zero in the big endian environment. --- flang/runtime/edit-input.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/runtime/edit-input.cpp b/flang/runtime/edit-input.cpp index 37989bbcee0ab8d..71021dd8a01588c 100644 --- a/flang/runtime/edit-input.cpp +++ b/flang/runtime/edit-input.cpp @@ -119,7 +119,7 @@ static RT_API_ATTRS bool EditBOZInput( std::memset(n, 0, bytes); int increment{isHostLittleEndian ? -1 : 1}; auto *data{reinterpret_cast(n) + - (isHostLittleEndian ? significantBytes - 1 : 0)}; + (isHostLittleEndian ? 
significantBytes - 1 : bytes - significantBytes)}; int shift{((digits - 1) * LOG2_BASE) & 7}; while (digits > 0) { char32_t ch{*io.NextInField(remaining, edit)}; From 29817a96262aa96f3e0be632d91ab810e75f728d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Aug 2024 14:53:43 +0100 Subject: [PATCH 014/112] [X86] Add test coverage for #74101 --- llvm/test/CodeGen/X86/combine-sub.ll | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll index 9d5934c345f8a5a..3123efc306360e4 100644 --- a/llvm/test/CodeGen/X86/combine-sub.ll +++ b/llvm/test/CodeGen/X86/combine-sub.ll @@ -452,3 +452,16 @@ define void @PR52032_4(ptr %p, ptr %q) { store <4 x i32> %i9, ptr %p2, align 4 ret void } + +; FIXME: Failure to fold add(xor(bsr(x),-32),33) -> add(or(bsr(x),-32),33) -> add(bsr(x),1) +define i32 @PR74101(i32 %a0) { +; CHECK-LABEL: PR74101: +; CHECK: # %bb.0: +; CHECK-NEXT: bsrl %edi, %eax +; CHECK-NEXT: xorl $-32, %eax +; CHECK-NEXT: addl $33, %eax +; CHECK-NEXT: retq + %lz = call i32 @llvm.ctlz.i32(i32 %a0, i1 true) + %add = sub nuw nsw i32 32, %lz + ret i32 %add +} From 2b1122eaec0d0b335feeac4adfaab2e3e04b7bd9 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Thu, 8 Aug 2024 15:18:34 +0100 Subject: [PATCH 015/112] [DebugInfo][RemoveDIs] Use iterator-insertion in unittests and fuzzer (#102015) These are the final few places in LLVM that use instruction pointers to insert instructions -- use iterators instead, which is needed for debug-info correctness in the future. Most of this is a gentle scattering of getIterator calls or not deref-then-addrofing iterators. libfuzzer does require a storage change to keep built instruction positions in a container though. The unit-test changes are very straightforwards. This leaves us in a position where libfuzzer can't fuzz on either of debug-info records, however I don't believe that fuzzing of debug-info is in scope for the library. 
--- llvm/include/llvm/FuzzMutate/OpDescriptor.h | 2 +- llvm/lib/FuzzMutate/IRMutator.cpp | 10 +-- llvm/lib/FuzzMutate/Operations.cpp | 54 +++++++++------- llvm/lib/FuzzMutate/RandomIRBuilder.cpp | 17 ++--- .../deltas/ReduceOperandBundles.cpp | 3 +- llvm/tools/llvm-stress/llvm-stress.cpp | 64 ++++++++++--------- .../Analysis/CGSCCPassManagerTest.cpp | 37 +++++------ llvm/unittests/Analysis/LazyCallGraphTest.cpp | 34 +++++----- .../Analysis/ScalarEvolutionTest.cpp | 21 +++--- llvm/unittests/FuzzMutate/OperationsTest.cpp | 12 ++-- llvm/unittests/IR/BasicBlockTest.cpp | 11 ++-- .../Utils/ScalarEvolutionExpanderTest.cpp | 24 +++---- 12 files changed, 155 insertions(+), 134 deletions(-) diff --git a/llvm/include/llvm/FuzzMutate/OpDescriptor.h b/llvm/include/llvm/FuzzMutate/OpDescriptor.h index 00a8ea0e5babc6c..78114074dbbfce7 100644 --- a/llvm/include/llvm/FuzzMutate/OpDescriptor.h +++ b/llvm/include/llvm/FuzzMutate/OpDescriptor.h @@ -89,7 +89,7 @@ class SourcePred { struct OpDescriptor { unsigned Weight; SmallVector SourcePreds; - std::function, Instruction *)> BuilderFunc; + std::function, BasicBlock::iterator)> BuilderFunc; }; static inline SourcePred onlyType(Type *Only) { diff --git a/llvm/lib/FuzzMutate/IRMutator.cpp b/llvm/lib/FuzzMutate/IRMutator.cpp index 3f27daad55e39ba..72e0de593760760 100644 --- a/llvm/lib/FuzzMutate/IRMutator.cpp +++ b/llvm/lib/FuzzMutate/IRMutator.cpp @@ -148,7 +148,7 @@ void InjectorIRStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { for (const auto &Pred : ArrayRef(OpDesc->SourcePreds).slice(1)) Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); - if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP])) { + if (Value *Op = OpDesc->BuilderFunc(Srcs, Insts[IP]->getIterator())) { // Find a sink and wire up the results of the operation. 
IB.connectToSink(BB, InstsAfter, Op); } @@ -388,9 +388,9 @@ void InsertFunctionStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { } bool isRetVoid = (F->getReturnType() == Type::getVoidTy(M->getContext())); auto BuilderFunc = [FTy, F, isRetVoid](ArrayRef Srcs, - Instruction *Inst) { + BasicBlock::iterator InsertPt) { StringRef Name = isRetVoid ? nullptr : "C"; - CallInst *Call = CallInst::Create(FTy, F, Srcs, Name, Inst); + CallInst *Call = CallInst::Create(FTy, F, Srcs, Name, InsertPt); // Don't return this call inst if it return void as it can't be sinked. return isRetVoid ? nullptr : Call; }; @@ -414,7 +414,7 @@ void InsertFunctionStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { Srcs.push_back(IB.findOrCreateSource(BB, InstsBefore, Srcs, Pred)); } - if (Value *Op = BuilderFunc(Srcs, Insts[IP])) { + if (Value *Op = BuilderFunc(Srcs, Insts[IP]->getIterator())) { // Find a sink and wire up the results of the operation. IB.connectToSink(BB, InstsAfter, Op); } @@ -543,7 +543,7 @@ void InsertPHIStrategy::mutate(BasicBlock &BB, RandomIRBuilder &IB) { if (&BB == &BB.getParent()->getEntryBlock()) return; Type *Ty = IB.randomType(); - PHINode *PHI = PHINode::Create(Ty, llvm::pred_size(&BB), "", &BB.front()); + PHINode *PHI = PHINode::Create(Ty, llvm::pred_size(&BB), "", BB.begin()); // Use a map to make sure the same incoming basic block has the same value. 
DenseMap IncomingValues; diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp index 408f35879acd3b4..389ff8130771c88 100644 --- a/llvm/lib/FuzzMutate/Operations.cpp +++ b/llvm/lib/FuzzMutate/Operations.cpp @@ -98,8 +98,8 @@ void llvm::describeFuzzerVectorOps(std::vector &Ops) { } OpDescriptor llvm::fuzzerop::selectDescriptor(unsigned Weight) { - auto buildOp = [](ArrayRef Srcs, Instruction *Inst) { - return SelectInst::Create(Srcs[0], Srcs[1], Srcs[2], "S", Inst); + auto buildOp = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return SelectInst::Create(Srcs[0], Srcs[1], Srcs[2], "S", InsertPt); }; return {Weight, {boolOrVecBoolType(), matchFirstLengthWAnyType(), matchSecondType()}, @@ -107,16 +107,16 @@ OpDescriptor llvm::fuzzerop::selectDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::fnegDescriptor(unsigned Weight) { - auto buildOp = [](ArrayRef Srcs, Instruction *Inst) { - return UnaryOperator::Create(Instruction::FNeg, Srcs[0], "F", Inst); + auto buildOp = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return UnaryOperator::Create(Instruction::FNeg, Srcs[0], "F", InsertPt); }; return {Weight, {anyFloatOrVecFloatType()}, buildOp}; } OpDescriptor llvm::fuzzerop::binOpDescriptor(unsigned Weight, Instruction::BinaryOps Op) { - auto buildOp = [Op](ArrayRef Srcs, Instruction *Inst) { - return BinaryOperator::Create(Op, Srcs[0], Srcs[1], "B", Inst); + auto buildOp = [Op](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return BinaryOperator::Create(Op, Srcs[0], Srcs[1], "B", InsertPt); }; switch (Op) { case Instruction::Add: @@ -148,8 +148,9 @@ OpDescriptor llvm::fuzzerop::binOpDescriptor(unsigned Weight, OpDescriptor llvm::fuzzerop::cmpOpDescriptor(unsigned Weight, Instruction::OtherOps CmpOp, CmpInst::Predicate Pred) { - auto buildOp = [CmpOp, Pred](ArrayRef Srcs, Instruction *Inst) { - return CmpInst::Create(CmpOp, Pred, Srcs[0], Srcs[1], "C", Inst); + auto buildOp = [CmpOp, Pred](ArrayRef Srcs, + 
BasicBlock::iterator InsertPt) { + return CmpInst::Create(CmpOp, Pred, Srcs[0], Srcs[1], "C", InsertPt); }; switch (CmpOp) { @@ -163,9 +164,10 @@ OpDescriptor llvm::fuzzerop::cmpOpDescriptor(unsigned Weight, } OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { - auto buildSplitBlock = [](ArrayRef Srcs, Instruction *Inst) { - BasicBlock *Block = Inst->getParent(); - BasicBlock *Next = Block->splitBasicBlock(Inst, "BB"); + auto buildSplitBlock = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + BasicBlock *Block = InsertPt->getParent(); + BasicBlock *Next = Block->splitBasicBlock(InsertPt, "BB"); // If it was an exception handling block, we are done. if (Block->isEHPad()) @@ -174,7 +176,8 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { // Loop back on this block by replacing the unconditional forward branch // with a conditional with a backedge. if (Block != &Block->getParent()->getEntryBlock()) { - BranchInst::Create(Block, Next, Srcs[0], Block->getTerminator()); + BranchInst::Create(Block, Next, Srcs[0], + Block->getTerminator()->getIterator()); Block->getTerminator()->eraseFromParent(); // We need values for each phi in the block. Since there isn't a good way @@ -193,12 +196,12 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::gepDescriptor(unsigned Weight) { - auto buildGEP = [](ArrayRef Srcs, Instruction *Inst) { + auto buildGEP = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { // TODO: It would be better to generate a random type here, rather than // generating a random value and picking its type. Type *Ty = Srcs[1]->getType(); auto Indices = ArrayRef(Srcs).drop_front(2); - return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", Inst); + return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", InsertPt); }; // TODO: Handle aggregates and vectors // TODO: Support multiple indices. 
@@ -239,10 +242,11 @@ static SourcePred validExtractValueIndex() { } OpDescriptor llvm::fuzzerop::extractValueDescriptor(unsigned Weight) { - auto buildExtract = [](ArrayRef Srcs, Instruction *Inst) { + auto buildExtract = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { // TODO: It's pretty inefficient to shuffle this all through constants. unsigned Idx = cast(Srcs[1])->getZExtValue(); - return ExtractValueInst::Create(Srcs[0], {Idx}, "E", Inst); + return ExtractValueInst::Create(Srcs[0], {Idx}, "E", InsertPt); }; // TODO: Should we handle multiple indices? return {Weight, {anyAggregateType(), validExtractValueIndex()}, buildExtract}; @@ -298,10 +302,10 @@ static SourcePred validInsertValueIndex() { } OpDescriptor llvm::fuzzerop::insertValueDescriptor(unsigned Weight) { - auto buildInsert = [](ArrayRef Srcs, Instruction *Inst) { + auto buildInsert = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { // TODO: It's pretty inefficient to shuffle this all through constants. unsigned Idx = cast(Srcs[2])->getZExtValue(); - return InsertValueInst::Create(Srcs[0], Srcs[1], {Idx}, "I", Inst); + return InsertValueInst::Create(Srcs[0], Srcs[1], {Idx}, "I", InsertPt); }; return { Weight, @@ -310,16 +314,17 @@ OpDescriptor llvm::fuzzerop::insertValueDescriptor(unsigned Weight) { } OpDescriptor llvm::fuzzerop::extractElementDescriptor(unsigned Weight) { - auto buildExtract = [](ArrayRef Srcs, Instruction *Inst) { - return ExtractElementInst::Create(Srcs[0], Srcs[1], "E", Inst); + auto buildExtract = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + return ExtractElementInst::Create(Srcs[0], Srcs[1], "E", InsertPt); }; // TODO: Try to avoid undefined accesses. 
return {Weight, {anyVectorType(), anyIntType()}, buildExtract}; } OpDescriptor llvm::fuzzerop::insertElementDescriptor(unsigned Weight) { - auto buildInsert = [](ArrayRef Srcs, Instruction *Inst) { - return InsertElementInst::Create(Srcs[0], Srcs[1], Srcs[2], "I", Inst); + auto buildInsert = [](ArrayRef Srcs, BasicBlock::iterator InsertPt) { + return InsertElementInst::Create(Srcs[0], Srcs[1], Srcs[2], "I", InsertPt); }; // TODO: Try to avoid undefined accesses. return {Weight, @@ -343,8 +348,9 @@ static SourcePred validShuffleVectorIndex() { } OpDescriptor llvm::fuzzerop::shuffleVectorDescriptor(unsigned Weight) { - auto buildShuffle = [](ArrayRef Srcs, Instruction *Inst) { - return new ShuffleVectorInst(Srcs[0], Srcs[1], Srcs[2], "S", Inst); + auto buildShuffle = [](ArrayRef Srcs, + BasicBlock::iterator InsertPt) { + return new ShuffleVectorInst(Srcs[0], Srcs[1], Srcs[2], "S", InsertPt); }; return {Weight, {anyVectorType(), matchFirstType(), validShuffleVectorIndex()}, diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp index 5569888e5b28e8b..fe4ad10a02d57d3 100644 --- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp +++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp @@ -69,9 +69,9 @@ AllocaInst *RandomIRBuilder::createStackMemory(Function *F, Type *Ty, BasicBlock *EntryBB = &F->getEntryBlock(); DataLayout DL(F->getParent()); AllocaInst *Alloca = new AllocaInst(Ty, DL.getAllocaAddrSpace(), "A", - &*EntryBB->getFirstInsertionPt()); + EntryBB->getFirstInsertionPt()); if (Init) - new StoreInst(Init, Alloca, Alloca->getNextNode()); + new StoreInst(Init, Alloca, std::next(Alloca->getIterator())); return Alloca; } @@ -165,7 +165,7 @@ Value *RandomIRBuilder::findOrCreateSource(BasicBlock &BB, Type *Ty = GV->getValueType(); LoadInst *LoadGV = nullptr; if (BB.getTerminator()) { - LoadGV = new LoadInst(Ty, GV, "LGV", &*BB.getFirstInsertionPt()); + LoadGV = new LoadInst(Ty, GV, "LGV", BB.getFirstInsertionPt()); } else { LoadGV = new 
LoadInst(Ty, GV, "LGV", &BB); } @@ -213,7 +213,7 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef Insts, } // Pick the type independently. Type *AccessTy = RS.getSelection()->getType(); - auto *NewLoad = new LoadInst(AccessTy, Ptr, "L", &*IP); + auto *NewLoad = new LoadInst(AccessTy, Ptr, "L", IP); // Only sample this load if it really matches the descriptor if (Pred.matches(Srcs, NewLoad)) @@ -231,7 +231,8 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef Insts, Function *F = BB.getParent(); AllocaInst *Alloca = createStackMemory(F, Ty, newSrc); if (BB.getTerminator()) { - newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", BB.getTerminator()); + newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", + BB.getTerminator()->getIterator()); } else { newSrc = new LoadInst(Ty, Alloca, /*ArrLen,*/ "L", &BB); } @@ -325,7 +326,7 @@ Instruction *RandomIRBuilder::connectToSink(BasicBlock &BB, for (BasicBlock *Dom : Dominators) { for (Instruction &I : *Dom) { if (isa(I.getType())) - return new StoreInst(V, &I, Insts.back()); + return new StoreInst(V, &I, Insts.back()->getIterator()); } } break; @@ -351,7 +352,7 @@ Instruction *RandomIRBuilder::connectToSink(BasicBlock &BB, Module *M = BB.getParent()->getParent(); auto [GV, DidCreate] = findOrCreateGlobalVariable(M, {}, fuzzerop::onlyType(V->getType())); - return new StoreInst(V, GV, Insts.back()); + return new StoreInst(V, GV, Insts.back()->getIterator()); } case EndOfValueSink: default: @@ -373,7 +374,7 @@ Instruction *RandomIRBuilder::newSink(BasicBlock &BB, } } - return new StoreInst(V, Ptr, Insts.back()); + return new StoreInst(V, Ptr, Insts.back()->getIterator()); } Value *RandomIRBuilder::findPointer(BasicBlock &BB, diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp index a211b6ac8d6cfc3..d2274877f126b21 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp +++ 
b/llvm/tools/llvm-reduce/deltas/ReduceOperandBundles.cpp @@ -88,7 +88,8 @@ static void maybeRewriteCallWithDifferentBundles( }); // Finally actually replace the bundles on the call. - CallBase *NewCall = CallBase::Create(OrigCall, NewBundles, OrigCall); + CallBase *NewCall = + CallBase::Create(OrigCall, NewBundles, OrigCall->getIterator()); OrigCall->replaceAllUsesWith(NewCall); OrigCall->eraseFromParent(); } diff --git a/llvm/tools/llvm-stress/llvm-stress.cpp b/llvm/tools/llvm-stress/llvm-stress.cpp index 80fb21038d304f9..e44b6023fff231e 100644 --- a/llvm/tools/llvm-stress/llvm-stress.cpp +++ b/llvm/tools/llvm-stress/llvm-stress.cpp @@ -336,7 +336,7 @@ struct LoadModifier: public Modifier { // Try to use predefined pointers. If non-exist, use undef pointer value; Value *Ptr = getRandomPointerValue(); Type *Ty = pickType(); - Value *V = new LoadInst(Ty, Ptr, "L", BB->getTerminator()); + Value *V = new LoadInst(Ty, Ptr, "L", BB->getTerminator()->getIterator()); PT->push_back(V); } }; @@ -356,7 +356,7 @@ struct StoreModifier: public Modifier { return; Value *Val = getRandomValue(ValTy); - new StoreInst(Val, Ptr, BB->getTerminator()); + new StoreInst(Val, Ptr, BB->getTerminator()->getIterator()); } }; @@ -399,7 +399,8 @@ struct BinModifier: public Modifier { case 12:{Op = Instruction::Xor; break; } } - PT->push_back(BinaryOperator::Create(Op, Val0, Val1, "B", Term)); + PT->push_back( + BinaryOperator::Create(Op, Val0, Val1, "B", Term->getIterator())); } }; @@ -462,8 +463,8 @@ struct AllocaModifier: public Modifier { void Act() override { Type *Tp = pickType(); const DataLayout &DL = BB->getDataLayout(); - PT->push_back(new AllocaInst(Tp, DL.getAllocaAddrSpace(), - "A", BB->getFirstNonPHI())); + PT->push_back(new AllocaInst(Tp, DL.getAllocaAddrSpace(), "A", + BB->getFirstNonPHIIt())); } }; @@ -474,9 +475,8 @@ struct ExtractElementModifier: public Modifier { void Act() override { Value *Val0 = getRandomVectorValue(); Value *V = ExtractElementInst::Create( - Val0, - 
getRandomValue(Type::getInt32Ty(BB->getContext())), - "E", BB->getTerminator()); + Val0, getRandomValue(Type::getInt32Ty(BB->getContext())), "E", + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -508,7 +508,7 @@ struct ShuffModifier: public Modifier { Constant *Mask = ConstantVector::get(Idxs); Value *V = new ShuffleVectorInst(Val0, Val1, Mask, "Shuff", - BB->getTerminator()); + BB->getTerminator()->getIterator()); PT->push_back(V); } }; @@ -522,9 +522,8 @@ struct InsertElementModifier: public Modifier { Value *Val1 = getRandomValue(Val0->getType()->getScalarType()); Value *V = InsertElementInst::Create( - Val0, Val1, - getRandomValue(Type::getInt32Ty(BB->getContext())), - "I", BB->getTerminator()); + Val0, Val1, getRandomValue(Type::getInt32Ty(BB->getContext())), "I", + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -550,7 +549,7 @@ struct CastModifier: public Modifier { if (!DestTy->isPointerTy()) DestTy = PointerType::get(DestTy, 0); return PT->push_back( - new BitCastInst(V, DestTy, "PC", BB->getTerminator())); + new BitCastInst(V, DestTy, "PC", BB->getTerminator()->getIterator())); } unsigned VSize = VTy->getScalarType()->getPrimitiveSizeInBits(); @@ -559,47 +558,50 @@ struct CastModifier: public Modifier { // Generate lots of bitcasts. 
if ((getRandom() & 1) && VSize == DestSize) { return PT->push_back( - new BitCastInst(V, DestTy, "BC", BB->getTerminator())); + new BitCastInst(V, DestTy, "BC", BB->getTerminator()->getIterator())); } // Both types are integers: if (VTy->isIntOrIntVectorTy() && DestTy->isIntOrIntVectorTy()) { if (VSize > DestSize) { return PT->push_back( - new TruncInst(V, DestTy, "Tr", BB->getTerminator())); + new TruncInst(V, DestTy, "Tr", BB->getTerminator()->getIterator())); } else { assert(VSize < DestSize && "Different int types with the same size?"); if (getRandom() & 1) - return PT->push_back( - new ZExtInst(V, DestTy, "ZE", BB->getTerminator())); - return PT->push_back(new SExtInst(V, DestTy, "Se", BB->getTerminator())); + return PT->push_back(new ZExtInst( + V, DestTy, "ZE", BB->getTerminator()->getIterator())); + return PT->push_back( + new SExtInst(V, DestTy, "Se", BB->getTerminator()->getIterator())); } } // Fp to int. if (VTy->isFPOrFPVectorTy() && DestTy->isIntOrIntVectorTy()) { if (getRandom() & 1) - return PT->push_back( - new FPToSIInst(V, DestTy, "FC", BB->getTerminator())); - return PT->push_back(new FPToUIInst(V, DestTy, "FC", BB->getTerminator())); + return PT->push_back(new FPToSIInst( + V, DestTy, "FC", BB->getTerminator()->getIterator())); + return PT->push_back( + new FPToUIInst(V, DestTy, "FC", BB->getTerminator()->getIterator())); } // Int to fp. if (VTy->isIntOrIntVectorTy() && DestTy->isFPOrFPVectorTy()) { if (getRandom() & 1) - return PT->push_back( - new SIToFPInst(V, DestTy, "FC", BB->getTerminator())); - return PT->push_back(new UIToFPInst(V, DestTy, "FC", BB->getTerminator())); + return PT->push_back(new SIToFPInst( + V, DestTy, "FC", BB->getTerminator()->getIterator())); + return PT->push_back( + new UIToFPInst(V, DestTy, "FC", BB->getTerminator()->getIterator())); } // Both floats. 
if (VTy->isFPOrFPVectorTy() && DestTy->isFPOrFPVectorTy()) { if (VSize > DestSize) { - return PT->push_back( - new FPTruncInst(V, DestTy, "Tr", BB->getTerminator())); + return PT->push_back(new FPTruncInst( + V, DestTy, "Tr", BB->getTerminator()->getIterator())); } else if (VSize < DestSize) { return PT->push_back( - new FPExtInst(V, DestTy, "ZE", BB->getTerminator())); + new FPExtInst(V, DestTy, "ZE", BB->getTerminator()->getIterator())); } // If VSize == DestSize, then the two types must be fp128 and ppc_fp128, // for which there is no defined conversion. So do nothing. @@ -625,7 +627,8 @@ struct SelectModifier: public Modifier { CondTy = VectorType::get(CondTy, VTy->getElementCount()); Value *Cond = getRandomValue(CondTy); - Value *V = SelectInst::Create(Cond, Val0, Val1, "Sl", BB->getTerminator()); + Value *V = SelectInst::Create(Cond, Val0, Val1, "Sl", + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -654,7 +657,7 @@ struct CmpModifier: public Modifier { Value *V = CmpInst::Create(fp ? 
Instruction::FCmp : Instruction::ICmp, (CmpInst::Predicate)op, Val0, Val1, "Cmp", - BB->getTerminator()); + BB->getTerminator()->getIterator()); return PT->push_back(V); } }; @@ -712,7 +715,8 @@ static void IntroduceControlFlow(Function *F, Random &R) { BasicBlock *Next = Curr->splitBasicBlock(Loc, "CF"); Instr->moveBefore(Curr->getTerminator()); if (Curr != &F->getEntryBlock()) { - BranchInst::Create(Curr, Next, Instr, Curr->getTerminator()); + BranchInst::Create(Curr, Next, Instr, + Curr->getTerminator()->getIterator()); Curr->getTerminator()->eraseFromParent(); } } diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index 9fd782d1b7614b1..5c71bc8063d6c9b 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -1205,7 +1205,7 @@ TEST_F(CGSCCPassManagerTest, TestAnalysisInvalidationCGSCCUpdate) { // Insert a bitcast of `h3` so that we retain a ref edge to it. (void)CastInst::CreatePointerCast( &H3F, PointerType::getUnqual(H2F.getContext()), "dummy", - &*H2F.begin()->begin()); + H2F.begin()->begin()); // Now update the call graph. auto &NewC = @@ -1251,7 +1251,7 @@ TEST_F(CGSCCPassManagerTest, TestAnalysisInvalidationCGSCCUpdate) { assert(H3F.getName() == "h3" && "Wrong called function!"); H2F.begin()->begin()->eraseFromParent(); // And insert a call to `h3`. - (void)CallInst::Create(&H3F, {}, "", &*H2F.begin()->begin()); + (void)CallInst::Create(&H3F, {}, "", H2F.begin()->begin()); // Now update the call graph. auto &NewC = @@ -1359,7 +1359,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses0) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. 
- Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1396,7 +1396,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses1) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. - Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1429,7 +1429,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses2) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); auto &FN = *llvm::find_if( @@ -1460,7 +1460,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses3) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); auto &FN = *llvm::find_if( @@ -1492,7 +1492,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses4) { ReturnInst::Create(FnewF->getContext(), BB); // And insert a call to `newF` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnewF, {}, "", IP); // Use the CallGraphUpdater to update the call graph for the new @@ -1536,7 +1536,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses5) { CGU.initialize(CG, C, AM, UR); // And insert a call to `newF` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnewF, {}, "", IP); auto 
&FN = *llvm::find_if( @@ -1569,7 +1569,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses6) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, `h2`, and `h3`. - Instruction *IP = &FnH2->getEntryBlock().front(); + BasicBlock::iterator IP = FnH2->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH2, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1600,7 +1600,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses7) { ASSERT_NE(FnH2, nullptr); // And insert a call to `h2` - Instruction *IP = &FnF->getEntryBlock().front(); + BasicBlock::iterator IP = FnF->getEntryBlock().begin(); (void)CallInst::Create(FnH2, {}, "", IP); // Use the CallGraphUpdater to update the call graph for the new @@ -1690,7 +1690,7 @@ TEST_F(CGSCCPassManagerTest, TestUpdateCGAndAnalysisManagerForPasses10) { ASSERT_NE(FnH3, nullptr); // And insert a call to `h1`, and `h3`. - Instruction *IP = &FnH1->getEntryBlock().front(); + BasicBlock::iterator IP = FnH1->getEntryBlock().begin(); (void)CallInst::Create(FnH1, {}, "", IP); (void)CallInst::Create(FnH3, {}, "", IP); @@ -1763,11 +1763,11 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions1) { // 2. Insert a ref edge from 'f' to 'f'. (void)CastInst::CreatePointerCast( &F, PointerType::getUnqual(F.getContext()), "f.ref", - &F.getEntryBlock().front()); + F.getEntryBlock().begin()); // 3. Insert a ref edge from 'f' to 'g'. (void)CastInst::CreatePointerCast( G, PointerType::getUnqual(F.getContext()), "g.ref", - &F.getEntryBlock().front()); + F.getEntryBlock().begin()); CG.addSplitFunction(F, *G); @@ -1827,9 +1827,9 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions2) { (void)ReturnInst::Create(G2->getContext(), G2BB); // Add 'f -> g1' call edge. - (void)CallInst::Create(G1, {}, "", &F.getEntryBlock().front()); + (void)CallInst::Create(G1, {}, "", F.getEntryBlock().begin()); // Add 'f -> g2' call edge. 
- (void)CallInst::Create(G2, {}, "", &F.getEntryBlock().front()); + (void)CallInst::Create(G2, {}, "", F.getEntryBlock().begin()); CG.addSplitFunction(F, *G1); CG.addSplitFunction(F, *G2); @@ -1853,11 +1853,11 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewFunctions2) { // Add 'f -> h1' ref edge. (void)CastInst::CreatePointerCast(H1, PointerType::getUnqual(F.getContext()), - "h1.ref", &F.getEntryBlock().front()); + "h1.ref", F.getEntryBlock().begin()); // Add 'f -> h2' ref edge. (void)CastInst::CreatePointerCast(H2, PointerType::getUnqual(F.getContext()), - "h2.ref", &F.getEntryBlock().front()); + "h2.ref", F.getEntryBlock().begin()); CG.addSplitRefRecursiveFunctions(F, SmallVector({H1, H2})); @@ -1980,7 +1980,8 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewNonTrivialCallEdge) { ASSERT_TRUE(F3 != nullptr); // Create call from f1 to f3. - (void)CallInst::Create(F3, {}, "", F.getEntryBlock().getTerminator()); + (void)CallInst::Create(F3, {}, "", + F.getEntryBlock().getTerminator()->getIterator()); ASSERT_NO_FATAL_FAILURE( updateCGAndAnalysisManagerForCGSCCPass(CG, C, *N, AM, UR, FAM)) diff --git a/llvm/unittests/Analysis/LazyCallGraphTest.cpp b/llvm/unittests/Analysis/LazyCallGraphTest.cpp index 64b6ccddc53b0c6..6cfc01ed81102fb 100644 --- a/llvm/unittests/Analysis/LazyCallGraphTest.cpp +++ b/llvm/unittests/Analysis/LazyCallGraphTest.cpp @@ -2357,7 +2357,7 @@ TEST(LazyCallGraphTest, AddSplitFunction1) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2398,7 +2398,7 @@ TEST(LazyCallGraphTest, AddSplitFunction2) { // Create f -ref-> g. 
(void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2441,7 +2441,7 @@ TEST(LazyCallGraphTest, AddSplitFunction3) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2487,7 +2487,7 @@ TEST(LazyCallGraphTest, AddSplitFunction4) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2533,7 +2533,7 @@ TEST(LazyCallGraphTest, AddSplitFunction5) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2577,7 +2577,7 @@ TEST(LazyCallGraphTest, AddSplitFunction6) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2628,7 +2628,7 @@ TEST(LazyCallGraphTest, AddSplitFunction7) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2681,7 +2681,7 @@ TEST(LazyCallGraphTest, AddSplitFunction8) { // Create f -ref-> g. 
(void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2734,7 +2734,7 @@ TEST(LazyCallGraphTest, AddSplitFunction9) { (void)ReturnInst::Create(Context, GBB); // Create f -call-> g. - (void)CallInst::Create(G, {}, "", &*F.getEntryBlock().begin()); + (void)CallInst::Create(G, {}, "", F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2778,7 +2778,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions1) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2822,7 +2822,7 @@ TEST(LazyCallGraphTest, AddSplitFunctions2) { // Create f -ref-> g. (void)CastInst::CreatePointerCast(G, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2875,9 +2875,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions3) { // Create f -ref-> g1 and f -ref-> g2. (void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -2934,9 +2934,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions4) { // Create f -ref-> g1 and f -ref-> g2. (void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); @@ -3004,9 +3004,9 @@ TEST(LazyCallGraphTest, AddSplitFunctions5) { // Create f -ref-> g1 and f -ref-> g2. 
(void)CastInst::CreatePointerCast(G1, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); (void)CastInst::CreatePointerCast(G2, PointerType::getUnqual(Context), "", - &*F.getEntryBlock().begin()); + F.getEntryBlock().begin()); EXPECT_FALSE(verifyModule(*M, &errs())); diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp index 64a7503d30eedd0..6fc24f6796310d5 100644 --- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp +++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp @@ -137,7 +137,7 @@ TEST_F(ScalarEvolutionsTest, SimplifiedPHI) { LoopBB); ReturnInst::Create(Context, nullptr, ExitBB); auto *Ty = Type::getInt32Ty(Context); - auto *PN = PHINode::Create(Ty, 2, "", &*LoopBB->begin()); + auto *PN = PHINode::Create(Ty, 2, "", LoopBB->begin()); PN->addIncoming(Constant::getNullValue(Ty), EntryBB); PN->addIncoming(UndefValue::get(Ty), LoopBB); ScalarEvolution SE = buildSE(*F); @@ -930,10 +930,12 @@ TEST_F(ScalarEvolutionsTest, SCEVAddRecFromPHIwithLargeConstants) { auto *Int64_32 = ConstantInt::get(Context, APInt(64, 32)); auto *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - auto *Phi = PHINode::Create(Type::getInt64Ty(Context), 2, "", Br); - auto *Shl = BinaryOperator::CreateShl(Phi, Int64_32, "", Br); - auto *AShr = BinaryOperator::CreateExactAShr(Shl, Int64_32, "", Br); - auto *Add = BinaryOperator::CreateAdd(AShr, MinInt64, "", Br); + auto *Phi = + PHINode::Create(Type::getInt64Ty(Context), 2, "", Br->getIterator()); + auto *Shl = BinaryOperator::CreateShl(Phi, Int64_32, "", Br->getIterator()); + auto *AShr = + BinaryOperator::CreateExactAShr(Shl, Int64_32, "", Br->getIterator()); + auto *Add = BinaryOperator::CreateAdd(AShr, MinInt64, "", Br->getIterator()); Phi->addIncoming(MinInt64, EntryBB); Phi->addIncoming(Add, LoopBB); // exit: @@ -986,10 +988,11 @@ TEST_F(ScalarEvolutionsTest, 
SCEVAddRecFromPHIwithLargeConstantAccum) { auto *Int32_16 = ConstantInt::get(Context, APInt(32, 16)); auto *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - auto *Phi = PHINode::Create(Int32Ty, 2, "", Br); - auto *Shl = BinaryOperator::CreateShl(Phi, Int32_16, "", Br); - auto *AShr = BinaryOperator::CreateExactAShr(Shl, Int32_16, "", Br); - auto *Add = BinaryOperator::CreateAdd(AShr, MinInt32, "", Br); + auto *Phi = PHINode::Create(Int32Ty, 2, "", Br->getIterator()); + auto *Shl = BinaryOperator::CreateShl(Phi, Int32_16, "", Br->getIterator()); + auto *AShr = + BinaryOperator::CreateExactAShr(Shl, Int32_16, "", Br->getIterator()); + auto *Add = BinaryOperator::CreateAdd(AShr, MinInt32, "", Br->getIterator()); auto *Arg = &*(F->arg_begin()); Phi->addIncoming(Arg, EntryBB); Phi->addIncoming(Add, LoopBB); diff --git a/llvm/unittests/FuzzMutate/OperationsTest.cpp b/llvm/unittests/FuzzMutate/OperationsTest.cpp index be4c75423a89a53..bc972ad21d049f1 100644 --- a/llvm/unittests/FuzzMutate/OperationsTest.cpp +++ b/llvm/unittests/FuzzMutate/OperationsTest.cpp @@ -261,7 +261,7 @@ TEST(OperationsTest, SplitBlock) { // Create a block with only a return and split it on the return. auto *BB = BasicBlock::Create(Ctx, "BB", F); auto *RI = ReturnInst::Create(Ctx, BB); - SBOp.BuilderFunc({UndefValue::get(Type::getInt1Ty(Ctx))}, RI); + SBOp.BuilderFunc({UndefValue::get(Type::getInt1Ty(Ctx))}, RI->getIterator()); // We should end up with an unconditional branch from BB to BB1, and the // return ends up in BB1. @@ -271,9 +271,9 @@ TEST(OperationsTest, SplitBlock) { ASSERT_THAT(RI->getParent(), Eq(BB1)); // Now add an instruction to BB1 and split on that. 
- auto *AI = new AllocaInst(Type::getInt8Ty(Ctx), 0, "a", RI); + auto *AI = new AllocaInst(Type::getInt8Ty(Ctx), 0, "a", RI->getIterator()); Value *Cond = ConstantInt::getFalse(Ctx); - SBOp.BuilderFunc({Cond}, AI); + SBOp.BuilderFunc({Cond}, AI->getIterator()); // We should end up with a loop back on BB1 and the instruction we split on // moves to BB2. @@ -313,7 +313,7 @@ TEST(OperationsTest, SplitEHBlock) { fuzzerop::OpDescriptor Descr = fuzzerop::splitBlockDescriptor(1); - Descr.BuilderFunc({ConstantInt::getTrue(Ctx)}, &*BB.getFirstInsertionPt()); + Descr.BuilderFunc({ConstantInt::getTrue(Ctx)}, BB.getFirstInsertionPt()); ASSERT_TRUE(!verifyModule(*M, &errs())); } @@ -346,7 +346,7 @@ TEST(OperationsTest, SplitBlockWithPhis) { // Now we split the block with PHI nodes, making sure they're all updated. Value *Cond = ConstantInt::getFalse(Ctx); - SBOp.BuilderFunc({Cond}, RI); + SBOp.BuilderFunc({Cond}, RI->getIterator()); // Make sure the PHIs are updated with a value for the third incoming edge. EXPECT_THAT(PHI1->getNumIncomingValues(), Eq(3u)); @@ -373,7 +373,7 @@ TEST(OperationsTest, GEP) { ConstantInt::get(Int32Ty, 0))); GEPOp.BuilderFunc({UndefValue::get(Int8PtrTy), ConstantInt::get(Int32Ty, 0)}, - RI); + RI->getIterator()); EXPECT_FALSE(verifyModule(M, &errs())); } diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index 3756f227143a508..eea2746a352aa62 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -49,12 +49,15 @@ TEST(BasicBlockTest, PhiRange) { // Now insert some PHI nodes. 
auto *Int32Ty = Type::getInt32Ty(Context); - auto *P1 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.1", BI); - auto *P2 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.2", BI); - auto *P3 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.3", BI); + auto *P1 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.1", + BI->getIterator()); + auto *P2 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.2", + BI->getIterator()); + auto *P3 = PHINode::Create(Int32Ty, /*NumReservedValues*/ 3, "phi.3", + BI->getIterator()); // Some non-PHI nodes. - auto *Sum = BinaryOperator::CreateAdd(P1, P2, "sum", BI); + auto *Sum = BinaryOperator::CreateAdd(P1, P2, "sum", BI->getIterator()); // Now wire up the incoming values that are interesting. P1->addIncoming(P2, BB.get()); diff --git a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp index c4560cb9ce9b22a..32870d9fb373782 100644 --- a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp +++ b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp @@ -99,22 +99,24 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) { const DataLayout &DL = F->getDataLayout(); BranchInst *Br = BranchInst::Create( LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB); - AllocaInst *Alloca = - new AllocaInst(I32Ty, DL.getAllocaAddrSpace(), "alloca", Br); + AllocaInst *Alloca = new AllocaInst(I32Ty, DL.getAllocaAddrSpace(), "alloca", + Br->getIterator()); ConstantInt *Ci32 = ConstantInt::get(Context, APInt(32, 1)); GetElementPtrInst *Gep0 = - GetElementPtrInst::Create(I32Ty, Alloca, Ci32, "gep0", Br); - CastInst *CastA = - CastInst::CreateBitOrPointerCast(Gep0, I8PtrTy, "bitcast1", Br); + GetElementPtrInst::Create(I32Ty, Alloca, Ci32, "gep0", Br->getIterator()); + CastInst *CastA = CastInst::CreateBitOrPointerCast(Gep0, I8PtrTy, "bitcast1", + Br->getIterator()); GetElementPtrInst *Gep1 = - 
GetElementPtrInst::Create(I8Ty, CastA, Ci32, "gep1", Br); + GetElementPtrInst::Create(I8Ty, CastA, Ci32, "gep1", Br->getIterator()); GetElementPtrInst *Gep2 = GetElementPtrInst::Create( - I8Ty, UndefValue::get(I8PtrTy), Ci32, "gep2", Br); + I8Ty, UndefValue::get(I8PtrTy), Ci32, "gep2", Br->getIterator()); CmpInst *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, - UndefValue::get(I8PtrTy), CastA, "cmp", Br); - SelectInst *Sel = SelectInst::Create(Cmp, Gep1, Gep2, "select", Br); - CastInst *CastB = - CastInst::CreateBitOrPointerCast(Sel, I32PtrTy, "bitcast2", Br); + UndefValue::get(I8PtrTy), CastA, "cmp", + Br->getIterator()); + SelectInst *Sel = + SelectInst::Create(Cmp, Gep1, Gep2, "select", Br->getIterator()); + CastInst *CastB = CastInst::CreateBitOrPointerCast(Sel, I32PtrTy, "bitcast2", + Br->getIterator()); ScalarEvolution SE = buildSE(*F); const SCEV *S = SE.getSCEV(CastB); From fdf8e3e31103bc81917cdb27150877f524bb2669 Mon Sep 17 00:00:00 2001 From: Volodymyr Sapsai Date: Thu, 8 Aug 2024 11:23:47 -0300 Subject: [PATCH 016/112] [Modules][Diagnostic] Mention which AST file's options differ from the current TU options. (#101413) Claiming a mismatch is always in a precompiled header is wrong and misleading as a mismatch can happen in any provided AST file. Emitting a path for a file with a problem allows to disambiguate between multiple input files. Use generic term "AST file" because we don't always know a kind of the provided file (for example, see `ASTReader::readASTFileControlBlock`). 
rdar://65005546 --- .../Basic/DiagnosticSerializationKinds.td | 46 ++--- clang/include/clang/Serialization/ASTReader.h | 34 ++- clang/lib/Frontend/ASTUnit.cpp | 6 +- clang/lib/Frontend/FrontendActions.cpp | 8 +- clang/lib/Serialization/ASTReader.cpp | 195 +++++++++--------- .../Modules/check-for-sanitizer-feature.cpp | 2 +- clang/test/Modules/ignored_macros.m | 2 +- clang/test/Modules/load_failure.c | 2 +- clang/test/Modules/merge-target-features.cpp | 6 +- clang/test/Modules/mismatch-diagnostics.cpp | 2 +- .../Modules/module-pch-different-cache-path.c | 4 +- clang/test/Modules/pr62359.cppm | 4 +- clang/test/PCH/arc.m | 4 +- clang/test/PCH/fuzzy-pch.c | 6 +- clang/test/PCH/module-hash-difference.m | 2 +- clang/test/PCH/ms-pch-macro.c | 4 +- clang/test/PCH/no-validate-pch.cl | 4 +- 17 files changed, 176 insertions(+), 155 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSerializationKinds.td b/clang/include/clang/Basic/DiagnosticSerializationKinds.td index 51d0abbbec252ab..9854972cbfe7e40 100644 --- a/clang/include/clang/Basic/DiagnosticSerializationKinds.td +++ b/clang/include/clang/Basic/DiagnosticSerializationKinds.td @@ -29,20 +29,20 @@ def note_pch_rebuild_required : Note<"please rebuild precompiled header '%0'">; def note_module_cache_path : Note< "after modifying system headers, please delete the module cache at '%0'">; -def err_pch_targetopt_mismatch : Error< - "PCH file was compiled for the %0 '%1' but the current translation " - "unit is being compiled for target '%2'">; -def err_pch_targetopt_feature_mismatch : Error< - "%select{AST file was|current translation unit is}0 compiled with the target " - "feature '%1' but the %select{current translation unit is|AST file was}0 " +def err_ast_file_targetopt_mismatch : Error< + "AST file '%0' was compiled for the %1 '%2' but the current translation " + "unit is being compiled for target '%3'">; +def err_ast_file_targetopt_feature_mismatch : Error< + "%select{AST file '%1' was|current translation unit 
is}0 compiled with the target " + "feature '%2' but the %select{current translation unit is|AST file '%1' was}0 " "not">; -def err_pch_langopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in " - "PCH file but is currently %select{disabled|enabled}2">; -def err_pch_langopt_value_mismatch : Error< - "%0 differs in PCH file vs. current file">; -def err_pch_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " - "the PCH file">; -def err_pch_modulecache_mismatch : Error<"PCH was compiled with module cache " +def err_ast_file_langopt_mismatch : Error<"%0 was %select{disabled|enabled}1 in " + "AST file '%3' but is currently %select{disabled|enabled}2">; +def err_ast_file_langopt_value_mismatch : Error< + "%0 differs in AST file '%1' vs. current file">; +def err_ast_file_diagopt_mismatch : Error<"%0 is currently enabled, but was not in " + "the AST file '%1'">; +def err_ast_file_modulecache_mismatch : Error<"AST file '%2' was compiled with module cache " "path '%0', but the path is currently '%1'">; def warn_pch_vfsoverlay_mismatch : Warning< "PCH was compiled with different VFS overlay files than are currently in use">, @@ -99,19 +99,19 @@ def err_module_different_modmap : Error< "module '%0' %select{uses|does not use}1 additional module map '%2'" "%select{| not}1 used when the module was built">; -def err_pch_macro_def_undef : Error< - "macro '%0' was %select{defined|undef'd}1 in the precompiled header but " +def err_ast_file_macro_def_undef : Error< + "macro '%0' was %select{defined|undef'd}1 in the AST file '%2' but " "%select{undef'd|defined}1 on the command line">; -def err_pch_macro_def_conflict : Error< - "definition of macro '%0' differs between the precompiled header ('%1') " +def err_ast_file_macro_def_conflict : Error< + "definition of macro '%0' differs between the AST file '%3' ('%1') " "and the command line ('%2')">; -def err_pch_undef : Error< - "%select{command line contains|precompiled header was built with}0 " - "'-undef' but 
%select{precompiled header was not built with it|" +def err_ast_file_undef : Error< + "%select{command line contains|AST file '%1' was built with}0 " + "'-undef' but %select{AST file '%1' was not built with it|" "it is not present on the command line}0">; -def err_pch_pp_detailed_record : Error< - "%select{command line contains|precompiled header was built with}0 " - "'-detailed-preprocessing-record' but %select{precompiled header was not " +def err_ast_file_pp_detailed_record : Error< + "%select{command line contains|AST file '%1' was built with}0 " + "'-detailed-preprocessing-record' but %select{AST file '%1' was not " "built with it|it is not present on the command line}0">; def err_module_odr_violation_missing_decl : Error< diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 6e4b0ec3d27c1b4..db93fbd703260d9 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -130,6 +130,7 @@ class ASTReaderListener { /// /// \returns true to indicate the options are invalid or false otherwise. virtual bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { return false; @@ -139,7 +140,7 @@ class ASTReaderListener { /// /// \returns true to indicate the target options are invalid, or false /// otherwise. - virtual bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + virtual bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { return false; } @@ -150,6 +151,7 @@ class ASTReaderListener { /// otherwise. virtual bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, + StringRef ModuleFilename, bool Complain) { return false; } @@ -172,6 +174,7 @@ class ASTReaderListener { /// \returns true to indicate the header search options are invalid, or false /// otherwise. 
virtual bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { return false; @@ -200,6 +203,7 @@ class ASTReaderListener { /// \returns true to indicate the preprocessor options are invalid, or false /// otherwise. virtual bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { return false; @@ -262,19 +266,22 @@ class ChainedASTReaderListener : public ASTReaderListener { bool ReadFullVersionInformation(StringRef FullVersion) override; void ReadModuleName(StringRef ModuleName) override; void ReadModuleMapFile(StringRef ModuleMapPath) override; - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, + StringRef ModuleFilename, bool Complain) override; bool ReadFileSystemOptions(const FileSystemOptions &FSOpts, bool Complain) override; bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override; @@ -299,16 +306,19 @@ class PCHValidator : public ASTReaderListener { PCHValidator(Preprocessor &PP, ASTReader &Reader) : PP(PP), Reader(Reader) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, 
bool AllowCompatibleDifferences) override; - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, + StringRef ModuleFilename, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override; bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override; void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override; @@ -325,6 +335,7 @@ class SimpleASTReaderListener : public ASTReaderListener { SimpleASTReaderListener(Preprocessor &PP) : PP(PP) {} bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override; }; @@ -1367,7 +1378,7 @@ class ASTReader const ModuleFile *ImportedBy, unsigned ClientLoadCapabilities); static ASTReadResult ReadOptionsBlock( - llvm::BitstreamCursor &Stream, unsigned ClientLoadCapabilities, + llvm::BitstreamCursor &Stream, StringRef Filename, unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, std::string &SuggestedPredefines); @@ -1380,6 +1391,7 @@ class ASTReader static ASTReadResult readUnhashedControlBlockImpl(ModuleFile *F, llvm::StringRef StreamData, + StringRef Filename, unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, @@ -1396,21 +1408,21 @@ class ASTReader unsigned ClientLoadCapabilities); llvm::Error ReadSubmoduleBlock(ModuleFile &F, unsigned ClientLoadCapabilities); - static bool ParseLanguageOptions(const RecordData &Record, bool Complain, + static bool ParseLanguageOptions(const 
RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseTargetOptions(const RecordData &Record, bool Complain, + static bool ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseDiagnosticOptions(const RecordData &Record, bool Complain, + static bool ParseDiagnosticOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseFileSystemOptions(const RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParseHeaderSearchOptions(const RecordData &Record, bool Complain, + static bool ParseHeaderSearchOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseHeaderSearchPaths(const RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParsePreprocessorOptions(const RecordData &Record, bool Complain, + static bool ParsePreprocessorOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines); diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index 67d4c07d1ce39a0..aec1402d6cb95b4 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -536,7 +536,7 @@ class ASTInfoCollector : public ASTReaderListener { LangOpt(LangOpt), TargetOpts(TargetOpts), Target(Target), Counter(Counter) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { if (InitializedLanguage) return false; @@ -559,6 +559,7 @@ class ASTInfoCollector : public ASTReaderListener { } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef 
ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { // llvm::SaveAndRestore doesn't support bit field. @@ -597,13 +598,14 @@ class ASTInfoCollector : public ASTReaderListener { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override { this->PPOpts = PPOpts; return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { // If we've already initialized the target, don't do it again. if (Target) diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index e70210d55fe28d5..f9887fe634ec6cd 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -622,7 +622,7 @@ namespace { Out.indent(2) << "Module map file: " << ModuleMapPath << "\n"; } - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Language options:\n"; #define LANGOPT(Name, Bits, Default, Description) \ @@ -645,7 +645,7 @@ namespace { return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Target options:\n"; Out.indent(4) << " Triple: " << TargetOpts.Triple << "\n"; @@ -665,7 +665,7 @@ namespace { } bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - bool Complain) override { + StringRef ModuleFilename, bool Complain) override { Out.indent(2) << "Diagnostic options:\n"; #define DIAGOPT(Name, Bits, Default) DUMP_BOOLEAN(DiagOpts->Name, #Name); #define 
ENUM_DIAGOPT(Name, Type, Bits, Default) \ @@ -684,6 +684,7 @@ namespace { } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { Out.indent(2) << "Header search options:\n"; @@ -717,6 +718,7 @@ namespace { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override { Out.indent(2) << "Preprocessor options:\n"; diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index e7ccf8a60feb5df..7fbd0ddde2d8a75 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -174,27 +174,28 @@ void ChainedASTReaderListener::ReadModuleMapFile(StringRef ModuleMapPath) { bool ChainedASTReaderListener::ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { - return First->ReadLanguageOptions(LangOpts, Complain, + return First->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences) || - Second->ReadLanguageOptions(LangOpts, Complain, + Second->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } bool ChainedASTReaderListener::ReadTargetOptions( - const TargetOptions &TargetOpts, bool Complain, + const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { - return First->ReadTargetOptions(TargetOpts, Complain, + return First->ReadTargetOptions(TargetOpts, ModuleFilename, Complain, AllowCompatibleDifferences) || - Second->ReadTargetOptions(TargetOpts, Complain, + Second->ReadTargetOptions(TargetOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } bool ChainedASTReaderListener::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, bool Complain) { - return First->ReadDiagnosticOptions(DiagOpts, Complain) || - 
Second->ReadDiagnosticOptions(DiagOpts, Complain); + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, bool Complain) { + return First->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain) || + Second->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain); } bool @@ -205,20 +206,20 @@ ChainedASTReaderListener::ReadFileSystemOptions(const FileSystemOptions &FSOpts, } bool ChainedASTReaderListener::ReadHeaderSearchOptions( - const HeaderSearchOptions &HSOpts, StringRef SpecificModuleCachePath, + const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { - return First->ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, + return First->ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, Complain) || - Second->ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, + Second->ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, Complain); } bool ChainedASTReaderListener::ReadPreprocessorOptions( - const PreprocessorOptions &PPOpts, bool ReadMacros, bool Complain, + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { - return First->ReadPreprocessorOptions(PPOpts, ReadMacros, Complain, + return First->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, SuggestedPredefines) || - Second->ReadPreprocessorOptions(PPOpts, ReadMacros, Complain, + Second->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, SuggestedPredefines); } @@ -282,17 +283,17 @@ ASTReaderListener::~ASTReaderListener() = default; /// \returns true if the languagae options mis-match, false otherwise. 
static bool checkLanguageOptions(const LangOptions &LangOpts, const LangOptions &ExistingLangOpts, - DiagnosticsEngine *Diags, + StringRef ModuleFilename, DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { #define LANGOPT(Name, Bits, Default, Description) \ if (ExistingLangOpts.Name != LangOpts.Name) { \ if (Diags) { \ if (Bits == 1) \ - Diags->Report(diag::err_pch_langopt_mismatch) \ - << Description << LangOpts.Name << ExistingLangOpts.Name; \ + Diags->Report(diag::err_ast_file_langopt_mismatch) \ + << Description << LangOpts.Name << ExistingLangOpts.Name << ModuleFilename; \ else \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ } \ return true; \ } @@ -300,16 +301,16 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, #define VALUE_LANGOPT(Name, Bits, Default, Description) \ if (ExistingLangOpts.Name != LangOpts.Name) { \ if (Diags) \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ return true; \ } #define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ if (ExistingLangOpts.get##Name() != LangOpts.get##Name()) { \ if (Diags) \ - Diags->Report(diag::err_pch_langopt_value_mismatch) \ - << Description; \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ return true; \ } @@ -332,22 +333,22 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, if (ExistingLangOpts.ModuleFeatures != LangOpts.ModuleFeatures) { if (Diags) - Diags->Report(diag::err_pch_langopt_value_mismatch) << "module features"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) << "module features" << ModuleFilename; return true; } if (ExistingLangOpts.ObjCRuntime != LangOpts.ObjCRuntime) { if (Diags) - 
Diags->Report(diag::err_pch_langopt_value_mismatch) - << "target Objective-C runtime"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "target Objective-C runtime" << ModuleFilename; return true; } if (ExistingLangOpts.CommentOpts.BlockCommandNames != LangOpts.CommentOpts.BlockCommandNames) { if (Diags) - Diags->Report(diag::err_pch_langopt_value_mismatch) - << "block command names"; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "block command names" << ModuleFilename; return true; } @@ -369,8 +370,8 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, bool InExistingModule = ExistingSanitizers.has(SanitizerKind::ID); \ bool InImportedModule = ImportedSanitizers.has(SanitizerKind::ID); \ if (InExistingModule != InImportedModule) \ - Diags->Report(diag::err_pch_targetopt_feature_mismatch) \ - << InExistingModule << (Flag + NAME); \ + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) \ + << InExistingModule << ModuleFilename << (Flag + NAME); \ } #include "clang/Basic/Sanitizers.def" } @@ -389,13 +390,13 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, /// \returns true if the target options mis-match, false otherwise. 
static bool checkTargetOptions(const TargetOptions &TargetOpts, const TargetOptions &ExistingTargetOpts, - DiagnosticsEngine *Diags, + StringRef ModuleFilename, DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { #define CHECK_TARGET_OPT(Field, Name) \ if (TargetOpts.Field != ExistingTargetOpts.Field) { \ if (Diags) \ - Diags->Report(diag::err_pch_targetopt_mismatch) \ - << Name << TargetOpts.Field << ExistingTargetOpts.Field; \ + Diags->Report(diag::err_ast_file_targetopt_mismatch) \ + << ModuleFilename << Name << TargetOpts.Field << ExistingTargetOpts.Field; \ return true; \ } @@ -439,11 +440,11 @@ static bool checkTargetOptions(const TargetOptions &TargetOpts, if (Diags) { for (StringRef Feature : UnmatchedReadFeatures) - Diags->Report(diag::err_pch_targetopt_feature_mismatch) - << /* is-existing-feature */ false << Feature; + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) + << /* is-existing-feature */ false << ModuleFilename << Feature; for (StringRef Feature : UnmatchedExistingFeatures) - Diags->Report(diag::err_pch_targetopt_feature_mismatch) - << /* is-existing-feature */ true << Feature; + Diags->Report(diag::err_ast_file_targetopt_feature_mismatch) + << /* is-existing-feature */ true << ModuleFilename << Feature; } return !UnmatchedReadFeatures.empty() || !UnmatchedExistingFeatures.empty(); @@ -451,19 +452,20 @@ static bool checkTargetOptions(const TargetOptions &TargetOpts, bool PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { const LangOptions &ExistingLangOpts = PP.getLangOpts(); - return checkLanguageOptions(LangOpts, ExistingLangOpts, + return checkLanguageOptions(LangOpts, ExistingLangOpts, ModuleFilename, Complain ? 
&Reader.Diags : nullptr, AllowCompatibleDifferences); } bool PCHValidator::ReadTargetOptions(const TargetOptions &TargetOpts, - bool Complain, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { const TargetOptions &ExistingTargetOpts = PP.getTargetInfo().getTargetOpts(); - return checkTargetOptions(TargetOpts, ExistingTargetOpts, + return checkTargetOptions(TargetOpts, ExistingTargetOpts, ModuleFilename, Complain ? &Reader.Diags : nullptr, AllowCompatibleDifferences); } @@ -478,7 +480,7 @@ using DeclsMap = llvm::DenseMap>; static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, DiagnosticsEngine &Diags, - bool Complain) { + StringRef ModuleFilename, bool Complain) { using Level = DiagnosticsEngine::Level; // Check current mappings for new -Werror mappings, and the stored mappings @@ -496,8 +498,8 @@ static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, StoredDiags.getDiagnosticLevel(DiagID, SourceLocation()); if (StoredLevel < DiagnosticsEngine::Error) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Werror=" + - Diags.getDiagnosticIDs()->getWarningOptionForDiag(DiagID).str(); + Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Werror=" + + Diags.getDiagnosticIDs()->getWarningOptionForDiag(DiagID).str() << ModuleFilename; return true; } } @@ -514,7 +516,7 @@ static bool isExtHandlingFromDiagsError(DiagnosticsEngine &Diags) { } static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, - DiagnosticsEngine &Diags, bool IsSystem, + DiagnosticsEngine &Diags, StringRef ModuleFilename, bool IsSystem, bool SystemHeaderWarningsInModule, bool Complain) { // Top-level options @@ -526,32 +528,32 @@ static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, if (StoredDiags.getSuppressSystemWarnings() && !SystemHeaderWarningsInModule) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Wsystem-headers"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) << 
"-Wsystem-headers" << ModuleFilename; return true; } } if (Diags.getWarningsAsErrors() && !StoredDiags.getWarningsAsErrors()) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Werror"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Werror" << ModuleFilename; return true; } if (Diags.getWarningsAsErrors() && Diags.getEnableAllWarnings() && !StoredDiags.getEnableAllWarnings()) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-Weverything -Werror"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Weverything -Werror" << ModuleFilename; return true; } if (isExtHandlingFromDiagsError(Diags) && !isExtHandlingFromDiagsError(StoredDiags)) { if (Complain) - Diags.Report(diag::err_pch_diagopt_mismatch) << "-pedantic-errors"; + Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-pedantic-errors" << ModuleFilename; return true; } - return checkDiagnosticGroupMappings(StoredDiags, Diags, Complain); + return checkDiagnosticGroupMappings(StoredDiags, Diags, ModuleFilename, Complain); } /// Return the top import module if it is implicit, nullptr otherwise. @@ -580,7 +582,7 @@ static Module *getTopImportImplicitModule(ModuleManager &ModuleMgr, } bool PCHValidator::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, bool Complain) { + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, bool Complain) { DiagnosticsEngine &ExistingDiags = PP.getDiagnostics(); IntrusiveRefCntPtr DiagIDs(ExistingDiags.getDiagnosticIDs()); IntrusiveRefCntPtr Diags( @@ -605,7 +607,7 @@ bool PCHValidator::ReadDiagnosticOptions( // FIXME: if the diagnostics are incompatible, save a DiagnosticOptions that // contains the union of their flags. - return checkDiagnosticMappings(*Diags, ExistingDiags, TopM->IsSystem, + return checkDiagnosticMappings(*Diags, ExistingDiags, ModuleFilename, TopM->IsSystem, SystemHeaderWarningsInModule, Complain); } @@ -665,7 +667,7 @@ enum OptionValidation { /// are no differences in the options between the two. 
static bool checkPreprocessorOptions( const PreprocessorOptions &PPOpts, - const PreprocessorOptions &ExistingPPOpts, bool ReadMacros, + const PreprocessorOptions &ExistingPPOpts, StringRef ModuleFilename, bool ReadMacros, DiagnosticsEngine *Diags, FileManager &FileMgr, std::string &SuggestedPredefines, const LangOptions &LangOpts, OptionValidation Validation = OptionValidateContradictions) { @@ -695,7 +697,7 @@ static bool checkPreprocessorOptions( // If strict matches are requested, don't tolerate any extra defines // on the command line that are missing in the AST file. if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) << MacroName << true; + Diags->Report(diag::err_ast_file_macro_def_undef) << MacroName << true << ModuleFilename; } return true; } @@ -721,8 +723,8 @@ static bool checkPreprocessorOptions( // conflict. if (Existing.second != Known->second.second) { if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) - << MacroName << Known->second.second; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << Known->second.second << ModuleFilename; } return true; } @@ -736,8 +738,8 @@ static bool checkPreprocessorOptions( // The macro bodies differ; complain. if (Diags) { - Diags->Report(diag::err_pch_macro_def_conflict) - << MacroName << Known->second.first << Existing.first; + Diags->Report(diag::err_ast_file_macro_def_conflict) + << MacroName << Known->second.first << Existing.first << ModuleFilename; } return true; } @@ -750,7 +752,7 @@ static bool checkPreprocessorOptions( // the AST file that are missing on the command line. 
for (const auto &MacroName : ASTFileMacros.keys()) { if (Diags) { - Diags->Report(diag::err_pch_macro_def_undef) << MacroName << false; + Diags->Report(diag::err_ast_file_macro_def_undef) << MacroName << false << ModuleFilename; } return true; } @@ -761,7 +763,7 @@ static bool checkPreprocessorOptions( if (PPOpts.UsePredefines != ExistingPPOpts.UsePredefines && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_pch_undef) << ExistingPPOpts.UsePredefines; + Diags->Report(diag::err_ast_file_undef) << ExistingPPOpts.UsePredefines << ModuleFilename; } return true; } @@ -771,7 +773,7 @@ static bool checkPreprocessorOptions( PPOpts.DetailedRecord != ExistingPPOpts.DetailedRecord && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_pch_pp_detailed_record) << PPOpts.DetailedRecord; + Diags->Report(diag::err_ast_file_pp_detailed_record) << PPOpts.DetailedRecord << ModuleFilename; } return true; } @@ -815,19 +817,19 @@ static bool checkPreprocessorOptions( } bool PCHValidator::ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { const PreprocessorOptions &ExistingPPOpts = PP.getPreprocessorOpts(); return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ReadMacros, Complain ? &Reader.Diags : nullptr, + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, Complain ? 
&Reader.Diags : nullptr, PP.getFileManager(), SuggestedPredefines, PP.getLangOpts()); } bool SimpleASTReaderListener::ReadPreprocessorOptions( - const PreprocessorOptions &PPOpts, bool ReadMacros, bool Complain, + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { - return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), ReadMacros, + return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), ModuleFilename, ReadMacros, nullptr, PP.getFileManager(), SuggestedPredefines, PP.getLangOpts(), OptionValidateNone); @@ -840,7 +842,7 @@ bool SimpleASTReaderListener::ReadPreprocessorOptions( static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS, StringRef SpecificModuleCachePath, StringRef ExistingModuleCachePath, - DiagnosticsEngine *Diags, + StringRef ModuleFilename, DiagnosticsEngine *Diags, const LangOptions &LangOpts, const PreprocessorOptions &PPOpts) { if (!LangOpts.Modules || PPOpts.AllowPCHWithDifferentModulesCachePath || @@ -851,18 +853,19 @@ static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS, if (EqualOrErr && *EqualOrErr) return false; if (Diags) - Diags->Report(diag::err_pch_modulecache_mismatch) - << SpecificModuleCachePath << ExistingModuleCachePath; + Diags->Report(diag::err_ast_file_modulecache_mismatch) + << SpecificModuleCachePath << ExistingModuleCachePath << ModuleFilename; return true; } bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { return checkModuleCachePath(Reader.getFileManager().getVirtualFileSystem(), SpecificModuleCachePath, PP.getHeaderSearchInfo().getModuleCachePath(), - Complain ? &Reader.Diags : nullptr, + ModuleFilename, Complain ? 
&Reader.Diags : nullptr, PP.getLangOpts(), PP.getPreprocessorOpts()); } @@ -2761,7 +2764,7 @@ static bool isDiagnosedResult(ASTReader::ASTReadResult ARR, unsigned Caps) { } ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( - BitstreamCursor &Stream, unsigned ClientLoadCapabilities, + BitstreamCursor &Stream, StringRef Filename, unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, std::string &SuggestedPredefines) { if (llvm::Error Err = Stream.EnterSubBlock(OPTIONS_BLOCK_ID)) { @@ -2806,7 +2809,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( switch ((OptionsRecordTypes)MaybeRecordType.get()) { case LANGUAGE_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; - if (ParseLanguageOptions(Record, Complain, Listener, + if (ParseLanguageOptions(Record, Filename, Complain, Listener, AllowCompatibleConfigurationMismatch)) Result = ConfigurationMismatch; break; @@ -2814,7 +2817,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case TARGET_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; - if (ParseTargetOptions(Record, Complain, Listener, + if (ParseTargetOptions(Record, Filename, Complain, Listener, AllowCompatibleConfigurationMismatch)) Result = ConfigurationMismatch; break; @@ -2831,7 +2834,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case HEADER_SEARCH_OPTIONS: { bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; if (!AllowCompatibleConfigurationMismatch && - ParseHeaderSearchOptions(Record, Complain, Listener)) + ParseHeaderSearchOptions(Record, Filename, Complain, Listener)) Result = ConfigurationMismatch; break; } @@ -2839,7 +2842,7 @@ ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( case PREPROCESSOR_OPTIONS: bool Complain = (ClientLoadCapabilities & ARR_ConfigurationMismatch) == 0; if (!AllowCompatibleConfigurationMismatch && - ParsePreprocessorOptions(Record, Complain, 
Listener, + ParsePreprocessorOptions(Record, Filename, Complain, Listener, SuggestedPredefines)) Result = ConfigurationMismatch; break; @@ -2976,7 +2979,7 @@ ASTReader::ReadControlBlock(ModuleFile &F, F.Kind == MK_ExplicitModule || F.Kind == MK_PrebuiltModule; ASTReadResult Result = - ReadOptionsBlock(Stream, ClientLoadCapabilities, + ReadOptionsBlock(Stream, F.FileName, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, *Listener, SuggestedPredefines); if (Result == Failure) { @@ -4872,7 +4875,7 @@ ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, bool DisableValidation = shouldDisableValidationForFile(F); ASTReadResult Result = readUnhashedControlBlockImpl( - &F, F.Data, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, + &F, F.Data, F.FileName, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, Listener.get(), WasImportedBy ? false : HSOpts.ModulesValidateDiagnosticOptions); @@ -4916,7 +4919,7 @@ ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, } ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl( - ModuleFile *F, llvm::StringRef StreamData, unsigned ClientLoadCapabilities, + ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, bool ValidateDiagnosticOptions) { // Initialize a stream. @@ -4986,7 +4989,7 @@ ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl( bool Complain = (ClientLoadCapabilities & ARR_OutOfDate) == 0; if (Listener && ValidateDiagnosticOptions && !AllowCompatibleConfigurationMismatch && - ParseDiagnosticOptions(Record, Complain, *Listener)) + ParseDiagnosticOptions(Record, Filename, Complain, *Listener)) Result = OutOfDate; // Don't return early. Read the signature. 
break; } @@ -5373,31 +5376,33 @@ namespace { ExistingModuleCachePath(ExistingModuleCachePath), FileMgr(FileMgr), StrictOptionMatches(StrictOptionMatches) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkLanguageOptions(ExistingLangOpts, LangOpts, nullptr, + return checkLanguageOptions(ExistingLangOpts, LangOpts, ModuleFilename, nullptr, AllowCompatibleDifferences); } - bool ReadTargetOptions(const TargetOptions &TargetOpts, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkTargetOptions(ExistingTargetOpts, TargetOpts, nullptr, + return checkTargetOptions(ExistingTargetOpts, TargetOpts, ModuleFilename, nullptr, AllowCompatibleDifferences); } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { return checkModuleCachePath( FileMgr.getVirtualFileSystem(), SpecificModuleCachePath, - ExistingModuleCachePath, nullptr, ExistingLangOpts, ExistingPPOpts); + ExistingModuleCachePath, ModuleFilename, nullptr, ExistingLangOpts, ExistingPPOpts); } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override { return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ReadMacros, /*Diags=*/nullptr, FileMgr, + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, /*Diags=*/nullptr, FileMgr, SuggestedPredefines, ExistingLangOpts, StrictOptionMatches ? 
OptionValidateStrictMatches : OptionValidateContradictions); @@ -5466,7 +5471,7 @@ bool ASTReader::readASTFileControlBlock( switch (Entry.ID) { case OPTIONS_BLOCK_ID: { std::string IgnoredSuggestedPredefines; - if (ReadOptionsBlock(Stream, ClientLoadCapabilities, + if (ReadOptionsBlock(Stream, Filename, ClientLoadCapabilities, /*AllowCompatibleConfigurationMismatch*/ false, Listener, IgnoredSuggestedPredefines) != Success) return true; @@ -5692,7 +5697,7 @@ bool ASTReader::readASTFileControlBlock( // Scan for the UNHASHED_CONTROL_BLOCK_ID block. if (readUnhashedControlBlockImpl( - nullptr, Bytes, ClientLoadCapabilities, + nullptr, Bytes, Filename, ClientLoadCapabilities, /*AllowCompatibleConfigurationMismatch*/ false, &Listener, ValidateDiagnosticOptions) != Success) return true; @@ -6033,7 +6038,7 @@ llvm::Error ASTReader::ReadSubmoduleBlock(ModuleFile &F, /// /// \returns true if the listener deems the file unacceptable, false otherwise. bool ASTReader::ParseLanguageOptions(const RecordData &Record, - bool Complain, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences) { LangOptions LangOpts; @@ -6070,11 +6075,11 @@ bool ASTReader::ParseLanguageOptions(const RecordData &Record, LangOpts.OMPHostIRFile = ReadString(Record, Idx); - return Listener.ReadLanguageOptions(LangOpts, Complain, + return Listener.ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences); } -bool ASTReader::ParseTargetOptions(const RecordData &Record, bool Complain, +bool ASTReader::ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences) { unsigned Idx = 0; @@ -6090,11 +6095,11 @@ bool ASTReader::ParseTargetOptions(const RecordData &Record, bool Complain, TargetOpts.Features.push_back(ReadString(Record, Idx)); } - return Listener.ReadTargetOptions(TargetOpts, Complain, + return Listener.ReadTargetOptions(TargetOpts, 
ModuleFilename, Complain, AllowCompatibleDifferences); } -bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, bool Complain, +bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener) { IntrusiveRefCntPtr DiagOpts(new DiagnosticOptions); unsigned Idx = 0; @@ -6108,7 +6113,7 @@ bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, bool Complain, for (unsigned N = Record[Idx++]; N; --N) DiagOpts->Remarks.push_back(ReadString(Record, Idx)); - return Listener.ReadDiagnosticOptions(DiagOpts, Complain); + return Listener.ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain); } bool ASTReader::ParseFileSystemOptions(const RecordData &Record, bool Complain, @@ -6120,7 +6125,7 @@ bool ASTReader::ParseFileSystemOptions(const RecordData &Record, bool Complain, } bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, - bool Complain, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener) { HeaderSearchOptions HSOpts; unsigned Idx = 0; @@ -6139,7 +6144,7 @@ bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, HSOpts.UseLibcxx = Record[Idx++]; std::string SpecificModuleCachePath = ReadString(Record, Idx); - return Listener.ReadHeaderSearchOptions(HSOpts, SpecificModuleCachePath, + return Listener.ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, Complain); } @@ -6176,7 +6181,7 @@ bool ASTReader::ParseHeaderSearchPaths(const RecordData &Record, bool Complain, } bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, - bool Complain, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines) { PreprocessorOptions PPOpts; @@ -6208,7 +6213,7 @@ bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, PPOpts.ObjCXXARCStandardLibrary = static_cast(Record[Idx++]); SuggestedPredefines.clear(); - return Listener.ReadPreprocessorOptions(PPOpts, 
ReadMacros, Complain, + return Listener.ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, SuggestedPredefines); } diff --git a/clang/test/Modules/check-for-sanitizer-feature.cpp b/clang/test/Modules/check-for-sanitizer-feature.cpp index 2137b1bf36bb845..861b571f0efaa0e 100644 --- a/clang/test/Modules/check-for-sanitizer-feature.cpp +++ b/clang/test/Modules/check-for-sanitizer-feature.cpp @@ -43,7 +43,7 @@ // // Import the PCH without ASan enabled (we expect an error). // RUN: not %clang_cc1 -x c -include-pch %t.asan_pch %s -verify 2>&1 | FileCheck %s --check-prefix=PCH_MISMATCH -// PCH_MISMATCH: AST file was compiled with the target feature '-fsanitize=address' but the current translation unit is not +// PCH_MISMATCH: AST file '{{.*}}.asan_pch' was compiled with the target feature '-fsanitize=address' but the current translation unit is not // // Emit a PCH with UBSan enabled. // RUN: %clang_cc1 -x c -fsanitize=null %S/Inputs/check-for-sanitizer-feature/check.h -emit-pch -o %t.ubsan_pch diff --git a/clang/test/Modules/ignored_macros.m b/clang/test/Modules/ignored_macros.m index a87a11f89c314f5..33801dfa4f47677 100644 --- a/clang/test/Modules/ignored_macros.m +++ b/clang/test/Modules/ignored_macros.m @@ -10,7 +10,7 @@ // RUN: %clang_cc1 -fmodules-cache-path=%t.modules -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch -x objective-c-header %s -verify // RUN: not %clang_cc1 -fmodules-cache-path=%t.modules -DIGNORED=1 -fmodules -fimplicit-module-maps -I %S/Inputs -include-pch %t.pch %s > %t.err 2>&1 // RUN: FileCheck -check-prefix=CHECK-CONFLICT %s < %t.err -// CHECK-CONFLICT: PCH was compiled with module cache path +// CHECK-CONFLICT: AST file '{{.*}}' was compiled with module cache path // Third trial: pass -DIGNORED=1 only to the second invocation, but // make it ignored. 
There should be no failure, IGNORED is defined in diff --git a/clang/test/Modules/load_failure.c b/clang/test/Modules/load_failure.c index 3a8d29597348da9..662b39b6f1874f8 100644 --- a/clang/test/Modules/load_failure.c +++ b/clang/test/Modules/load_failure.c @@ -15,7 +15,7 @@ // RUN: FileCheck -check-prefix=CHECK-FAILURE %s < %t.out // FIXME: Clean up diagnostic text below and give it a location -// CHECK-FAILURE: error: C99 was disabled in PCH file but is currently enabled +// CHECK-FAILURE: error: C99 was disabled in AST file '{{.*}}load_failure.pcm' but is currently enabled // FIXME: When we have a syntax for modules in C, use that. diff --git a/clang/test/Modules/merge-target-features.cpp b/clang/test/Modules/merge-target-features.cpp index 6a29c2db8a8d9e4..cc2bbfa077e9857 100644 --- a/clang/test/Modules/merge-target-features.cpp +++ b/clang/test/Modules/merge-target-features.cpp @@ -20,7 +20,7 @@ // RUN: -target-cpu i386 \ // RUN: -fsyntax-only merge-target-features.cpp 2>&1 \ // RUN: | FileCheck --check-prefix=SUBSET --implicit-check-not=error: %s -// SUBSET: error: AST file was compiled with the target feature '+sse2' but the current translation unit is not +// SUBSET: error: AST file '{{.*}}foo.pcm' was compiled with the target feature '+sse2' but the current translation unit is not // SUBSET: error: {{.*}} configuration mismatch // // RUN: %clang_cc1 -fmodules -x c++ -fmodules-cache-path=%t \ @@ -57,8 +57,8 @@ // RUN: -target-cpu i386 -target-feature +cx16 \ // RUN: -fsyntax-only merge-target-features.cpp 2>&1 \ // RUN: | FileCheck --check-prefix=MISMATCH --implicit-check-not=error: %s -// MISMATCH: error: AST file was compiled with the target feature '+sse2' but the current translation unit is not -// MISMATCH: error: current translation unit is compiled with the target feature '+cx16' but the AST file was not +// MISMATCH: error: AST file '{{.*}}foo.pcm' was compiled with the target feature '+sse2' but the current translation unit is not +// MISMATCH: 
error: current translation unit is compiled with the target feature '+cx16' but the AST file '{{.*}}foo.pcm' was not // MISMATCH: error: {{.*}} configuration mismatch #include "foo.h" diff --git a/clang/test/Modules/mismatch-diagnostics.cpp b/clang/test/Modules/mismatch-diagnostics.cpp index 5a026aa1f6c020f..dffd4b46a678e5b 100644 --- a/clang/test/Modules/mismatch-diagnostics.cpp +++ b/clang/test/Modules/mismatch-diagnostics.cpp @@ -29,5 +29,5 @@ export module mismatching_module; //--- use.cpp import mismatching_module; -// CHECK: error: POSIX thread support was enabled in PCH file but is currently disabled +// CHECK: error: POSIX thread support was enabled in AST file '{{.*[/|\\\\]}}mismatching_module.pcm' but is currently disabled // CHECK-NEXT: module file {{.*[/|\\\\]}}mismatching_module.pcm cannot be loaded due to a configuration mismatch with the current compilation diff --git a/clang/test/Modules/module-pch-different-cache-path.c b/clang/test/Modules/module-pch-different-cache-path.c index 8778adc886f719e..8dd04a166eab629 100644 --- a/clang/test/Modules/module-pch-different-cache-path.c +++ b/clang/test/Modules/module-pch-different-cache-path.c @@ -14,5 +14,5 @@ pch_int x = 0; -// CHECK-ERROR: PCH was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' -// CHECK-SUCCESS-NOT: PCH was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' +// CHECK-ERROR: AST file '{{.*}}' was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' +// CHECK-SUCCESS-NOT: AST file '{{.*}}' was compiled with module cache path '{{.*}}', but the path is currently '{{.*}}' diff --git a/clang/test/Modules/pr62359.cppm b/clang/test/Modules/pr62359.cppm index 69acc3ce303a574..7d9d3eec26cca7a 100644 --- a/clang/test/Modules/pr62359.cppm +++ b/clang/test/Modules/pr62359.cppm @@ -43,7 +43,7 @@ int use() { return 0; } -// CHECK: OpenMP{{.*}}differs in PCH file vs. 
current file +// CHECK: OpenMP{{.*}}differs in AST file '{{.*}}Hello.pcm' vs. current file //--- use2.cpp // expected-no-diagnostics @@ -55,5 +55,5 @@ int use2() { return 0; } -// CHECK: OpenMP{{.*}}differs in PCH file vs. current file +// CHECK: OpenMP{{.*}}differs in AST file '{{.*}}Hello.pcm' vs. current file // CHECK: use of undeclared identifier 'pragma' diff --git a/clang/test/PCH/arc.m b/clang/test/PCH/arc.m index 32069e2314164cd..e4ad71a469b956b 100644 --- a/clang/test/PCH/arc.m +++ b/clang/test/PCH/arc.m @@ -14,5 +14,5 @@ array0 a0; array1 a1; -// CHECK-ERR1: Objective-C automated reference counting was enabled in PCH file but is currently disabled -// CHECK-ERR2: Objective-C automated reference counting was disabled in PCH file but is currently enabled +// CHECK-ERR1: Objective-C automated reference counting was enabled in AST file '{{.*}}' but is currently disabled +// CHECK-ERR2: Objective-C automated reference counting was disabled in AST file '{{.*}}' but is currently enabled diff --git a/clang/test/PCH/fuzzy-pch.c b/clang/test/PCH/fuzzy-pch.c index 7296d1dc893b3b1..53985866dc08ed1 100644 --- a/clang/test/PCH/fuzzy-pch.c +++ b/clang/test/PCH/fuzzy-pch.c @@ -24,8 +24,8 @@ BAR bar = 17; # error BAR was not defined #endif -// CHECK-FOO: definition of macro 'FOO' differs between the precompiled header ('1') and the command line ('blah') -// CHECK-NOFOO: macro 'FOO' was defined in the precompiled header but undef'd on the command line +// CHECK-FOO: definition of macro 'FOO' differs between the AST file '{{.*}}' ('1') and the command line ('blah') +// CHECK-NOFOO: macro 'FOO' was defined in the AST file '{{.*}}' but undef'd on the command line -// CHECK-UNDEF: command line contains '-undef' but precompiled header was not built with it +// CHECK-UNDEF: command line contains '-undef' but AST file '{{.*}}' was not built with it diff --git a/clang/test/PCH/module-hash-difference.m b/clang/test/PCH/module-hash-difference.m index fc542b0e8d1ad1e..73cf536f88b4f17 
100644 --- a/clang/test/PCH/module-hash-difference.m +++ b/clang/test/PCH/module-hash-difference.m @@ -4,5 +4,5 @@ // RUN: not %clang_cc1 -fsyntax-only -include-pch %t.pch %s -I %S/Inputs/modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t.mcp -fdisable-module-hash 2> %t.err // RUN: FileCheck -input-file=%t.err %s -// CHECK: error: PCH was compiled with module cache path {{.*}}, but the path is currently {{.*}} +// CHECK: error: AST file '{{.*}}' was compiled with module cache path {{.*}}, but the path is currently {{.*}} @import Foo; diff --git a/clang/test/PCH/ms-pch-macro.c b/clang/test/PCH/ms-pch-macro.c index a512e66e2486686..4d4900cc4f90dc0 100644 --- a/clang/test/PCH/ms-pch-macro.c +++ b/clang/test/PCH/ms-pch-macro.c @@ -33,7 +33,7 @@ BAR bar = 17; # error BAR was not defined #endif -// CHECK-FOO: definition of macro 'FOO' differs between the precompiled header ('1') and the command line ('blah') -// CHECK-NOFOO: macro 'FOO' was defined in the precompiled header but undef'd on the command line +// CHECK-FOO: definition of macro 'FOO' differs between the AST file '{{.*}}1.pch' ('1') and the command line ('blah') +// CHECK-NOFOO: macro 'FOO' was defined in the AST file '{{.*}}1.pch' but undef'd on the command line // expected-warning@2 {{definition of macro 'BAR' does not match definition in precompiled header}} diff --git a/clang/test/PCH/no-validate-pch.cl b/clang/test/PCH/no-validate-pch.cl index 26c5fd5cc04c2ca..aa228ee2052192c 100644 --- a/clang/test/PCH/no-validate-pch.cl +++ b/clang/test/PCH/no-validate-pch.cl @@ -16,8 +16,8 @@ // CHECK: note: previous definition is here // CHECK: #define X 4 -// CHECK-VAL: error: __OPTIMIZE__ predefined macro was enabled in PCH file but is currently disabled -// CHECK-VAL: error: definition of macro 'X' differs between the precompiled header ('4') and the command line ('5') +// CHECK-VAL: error: __OPTIMIZE__ predefined macro was enabled in AST file '{{.*}}' but is currently disabled +// CHECK-VAL: error: 
definition of macro 'X' differs between the AST file '{{.*}}' ('4') and the command line ('5') void test(void) { int a = ONE; From 37a94b7eddeab0ae42207699fb9be37587172a9e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Aug 2024 16:31:20 +0200 Subject: [PATCH 017/112] [LICM][MustExec] Make must-exec logic for IV condition commutative (#93150) MustExec has special logic to determine whether the first loop iteration will always be executed, by simplifying the IV comparison with the start value. Currently, this code assumes that the IV is on the LHS of the comparison, but this is not guaranteed. Make sure it handles the commuted variant as well. The changed PhaseOrdering test previously performed peeling to make the loads dereferenceable -- as a side effect, this also reduced the exit count by one, avoiding the awkward <= MAX case. Now we know up-front that the loads are dereferenceable and can be simply hoisted. As such, we retain the original exit count and now have to handle it by widening the exit count calculation to i128. This is a regression, but at least it preserves the vectorization, which was the original goal. I'm not sure what else can be done about that test.
--- llvm/lib/Analysis/MustExecute.cpp | 19 +- llvm/test/Transforms/LICM/hoist-mustexec.ll | 3 +- ...ple-unreachable-exits-for-vectorization.ll | 287 ++++++++---------- 3 files changed, 148 insertions(+), 161 deletions(-) diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 904d30d0544654d..caed62679a683cc 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -135,16 +135,21 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock, // todo: this would be a lot more powerful if we used scev, but all the // plumbing is currently missing to pass a pointer in from the pass // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known + ICmpInst::Predicate Pred = Cond->getPredicate(); auto *LHS = dyn_cast(Cond->getOperand(0)); auto *RHS = Cond->getOperand(1); - if (!LHS || LHS->getParent() != CurLoop->getHeader()) - return false; - auto DL = ExitBlock->getDataLayout(); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) { + Pred = Cond->getSwappedPredicate(); + LHS = dyn_cast(Cond->getOperand(1)); + RHS = Cond->getOperand(0); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) + return false; + } + + auto DL = ExitBlock->getModule()->getDataLayout(); auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader()); - auto *SimpleValOrNull = simplifyCmpInst(Cond->getPredicate(), - IVStart, RHS, - {DL, /*TLI*/ nullptr, - DT, /*AC*/ nullptr, BI}); + auto *SimpleValOrNull = simplifyCmpInst( + Pred, IVStart, RHS, {DL, /*TLI*/ nullptr, DT, /*AC*/ nullptr, BI}); auto *SimpleCst = dyn_cast_or_null(SimpleValOrNull); if (!SimpleCst) return false; diff --git a/llvm/test/Transforms/LICM/hoist-mustexec.ll b/llvm/test/Transforms/LICM/hoist-mustexec.ll index 81e0815053ffe5b..a6f5a2be05ee410 100644 --- a/llvm/test/Transforms/LICM/hoist-mustexec.ll +++ b/llvm/test/Transforms/LICM/hoist-mustexec.ll @@ -218,7 +218,6 @@ fail: } ; Same as previous case, with commuted icmp. 
-; FIXME: The load should get hoisted here as well. define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-LABEL: define i32 @test3_commuted( ; CHECK-SAME: ptr noalias nocapture readonly [[A:%.*]]) #[[ATTR1]] { @@ -227,6 +226,7 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[IS_ZERO:%.*]] = icmp eq i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[IS_ZERO]], label [[FAIL:%.*]], label [[PREHEADER:%.*]] ; CHECK: preheader: +; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[INC:%.*]], [[CONTINUE:%.*]] ] @@ -234,7 +234,6 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[R_CHK:%.*]] = icmp uge i32 [[LEN]], [[IV]] ; CHECK-NEXT: br i1 [[R_CHK]], label [[CONTINUE]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[I1]], [[ACC]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll index 8fc5189e8bc79e9..cc4890e27f2bda6 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -9,95 +9,87 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) { ; CHECK-LABEL: @sum_2_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit11.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: 
[[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: 
[[UMIN12:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i128 [[TMP1]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i128 [[TMP3]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ERROR_I10:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN15]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER20:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP7]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]] -; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP10]], [[WIDE_LOAD18]] -; CHECK-NEXT: [[TMP13]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD19]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD14]], [[VEC_PHI13]] +; CHECK-NEXT: [[TMP14]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[TMP15]] = add <2 x i64> [[TMP13]], 
[[WIDE_LOAD16]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER20]] -; CHECK: loop.preheader20: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[LOOP_PREHEADER17]] +; CHECK: loop.preheader17: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11]] ], [ [[SUM_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: 
unreachable -; CHECK: at_with_int_conversion.exit: -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER17]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER17]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -120,120 +112,111 @@ exit: define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) { ; 
CHECK-LABEL: @sum_3_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit22.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] -; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 8 +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[START_I12:%.*]] = load ptr, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C]], i64 8 +; CHECK-NEXT: [[END_I14:%.*]] = load ptr, ptr [[GEP_END_I13]], align 8 +; CHECK-NEXT: [[START_INT_I15:%.*]] = ptrtoint ptr [[START_I12]] to i64 +; CHECK-NEXT: [[END_INT_I16:%.*]] = ptrtoint ptr [[END_I14]] to i64 +; CHECK-NEXT: [[SUB_I17:%.*]] = sub i64 [[END_INT_I16]], [[START_INT_I15]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = zext i64 [[SUB_I17]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i128 [[TMP4]], 1 +; CHECK-NEXT: [[UMIN23:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP5]]) ; CHECK-NEXT: 
[[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_I3_PEEL_FR:%.*]] = freeze ptr [[END_I3_PEEL]] -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL_FR]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[START_I12_PEEL:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[END_I14_PEEL:%.*]] = load ptr, ptr [[GEP_END_I13]], align 8 -; CHECK-NEXT: [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]] -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 8 -; CHECK-NEXT: [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: [[UMIN24:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN23]], i128 [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[UMIN24]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[UMIN24]] +; CHECK-NEXT: br i1 [[TMP7]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i128 [[TMP3]], [[UMIN24]] +; 
CHECK-NEXT: br i1 [[TMP9]], label [[ERROR_I10:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[ERROR_I21:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]]) -; CHECK-NEXT: [[UMIN27:%.*]] = tail call i64 @llvm.umin.i64(i64 [[UMIN26]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN27]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER34:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER31:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP10]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = 
getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]] -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD31]] -; CHECK-NEXT: [[TMP16]] = add <2 x i64> [[TMP14]], [[WIDE_LOAD32]] -; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[TMP15]], [[WIDE_LOAD33]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI25:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, 
ptr [[TMP13]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i64>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <2 x i64>, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD26]], [[VEC_PHI25]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[TMP17]], [[WIDE_LOAD27]] +; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD28]] +; CHECK-NEXT: [[TMP21]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD29]] +; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD30]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER34]] -; CHECK: loop.preheader34: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = 
icmp eq i64 [[TMP10]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[LOOP_PREHEADER31]] +; CHECK: loop.preheader31: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22]] ], [ [[SUM_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER31]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER31]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: -; CHECK-NEXT: [[INRANGE_I18:%.*]] = icmp ult i64 [[SUB_I17_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I18]], label [[ERROR_I21:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT22]] -; CHECK: error.i21: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit22: -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] 
+; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 -; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[IV]] ; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i21: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: From 22a130220c306935ff9bd6bfcb5ee7ce1600e304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 8 Aug 2024 15:57:36 +0100 Subject: [PATCH 018/112] [mlir][vector] Add more tests for ConvertVectorToLLVM (1/n) (#101936) Adds tests with scalable vectors for the Vector-To-LLVM conversion pass. Covers the following Ops: * vector.bitcast * vector.broadcast Note, this has uncovered some missing logic in `BroadcastOpLowering`. 
This PR fixes the most basic cases where the scalable flags were dropped and the generated code was incorrect. Also, the conditions in `vector::isBroadcastableTo` are relaxed to allow cases like this: ```mlir %0 = vector.broadcast %arg0 : vector<1xf32> to vector<[4]xf32> ``` The `BroadcastOpLowering` pattern is effectively disabled for scalable vectors in more complex cases where an SCF loop would be required to loop over the scalable dims, e.g.: ```mlir %0 = vector.broadcast %arg0 : vector<[4]x1x2xf32> to vector<[4]x3x2xf32> ``` These cases are marked as "Stretch not at start" in the code. In those cases, support for scalable vectors is left as a TODO. --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 5 +- .../Transforms/LowerVectorBroadcast.cpp | 7 +- .../VectorToLLVM/vector-to-llvm.mlir | 236 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index fe58bd35684e4d0..b8734d81311f67d 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2405,7 +2405,10 @@ BroadcastableToResult mlir::vector::isBroadcastableTo( bool srcDimScalableFlag = srcVectorType.getScalableDims()[dimIdx]; bool dstDimScalableFlag = dstVectorType.getScalableDims()[lead + dimIdx]; if ((srcDim == 1 && srcDimScalableFlag && dstDim != 1) || - (srcDimScalableFlag != dstDimScalableFlag)) + // 1 -> [N] is fine, everything else should be rejected when mixing + // fixed-width and scalable dims + (srcDimScalableFlag != dstDimScalableFlag && + (srcDim != 1 || srcDimScalableFlag))) foundMismatchingDims = true; if (foundMismatchingDims) { diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp index 32e7eb27f5e29b3..6c36bbaee85237e 100644 --- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorBroadcast.cpp @@ 
-125,7 +125,8 @@ class BroadcastOpLowering : public OpRewritePattern { // .. // %x = [%a,%b,%c,%d] VectorType resType = - VectorType::get(dstType.getShape().drop_front(), eltType); + VectorType::get(dstType.getShape().drop_front(), eltType, + dstType.getScalableDims().drop_front()); Value result = rewriter.create( loc, dstType, rewriter.getZeroAttr(dstType)); if (m == 0) { @@ -136,6 +137,10 @@ class BroadcastOpLowering : public OpRewritePattern { result = rewriter.create(loc, bcst, result, d); } else { // Stetch not at start. + if (dstType.getScalableDims()[0]) { + // TODO: For scalable vectors we should emit an scf.for loop. + return failure(); + } for (int64_t d = 0, dim = dstType.getDimSize(0); d < dim; ++d) { Value ext = rewriter.create(loc, op.getSource(), d); Value bcst = rewriter.create(loc, resType, ext); diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index c310954b906e4e5..69c4b5e5c87e521 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -23,6 +23,15 @@ func.func @bitcast_f32_to_i32_vector(%input: vector<16xf32>) -> vector<16xi32> { // CHECK-SAME: %[[input:.*]]: vector<16xf32> // CHECK: llvm.bitcast %[[input]] : vector<16xf32> to vector<16xi32> +func.func @bitcast_f32_to_i32_vector_scalable(%input: vector<[16]xf32>) -> vector<[16]xi32> { + %0 = vector.bitcast %input : vector<[16]xf32> to vector<[16]xi32> + return %0 : vector<[16]xi32> +} + +// CHECK-LABEL: @bitcast_f32_to_i32_vector_scalable +// CHECK-SAME: %[[input:.*]]: vector<[16]xf32> +// CHECK: llvm.bitcast %[[input]] : vector<[16]xf32> to vector<[16]xi32> + // ----- func.func @bitcast_i8_to_f32_vector(%input: vector<64xi8>) -> vector<16xf32> { @@ -34,6 +43,15 @@ func.func @bitcast_i8_to_f32_vector(%input: vector<64xi8>) -> vector<16xf32> { // CHECK-SAME: %[[input:.*]]: vector<64xi8> // CHECK: llvm.bitcast %[[input]] : vector<64xi8> to 
vector<16xf32> +func.func @bitcast_i8_to_f32_vector_scalable(%input: vector<[64]xi8>) -> vector<[16]xf32> { + %0 = vector.bitcast %input : vector<[64]xi8> to vector<[16]xf32> + return %0 : vector<[16]xf32> +} + +// CHECK-LABEL: @bitcast_i8_to_f32_vector_scalable +// CHECK-SAME: %[[input:.*]]: vector<[64]xi8> +// CHECK: llvm.bitcast %[[input]] : vector<[64]xi8> to vector<[16]xf32> + // ----- func.func @bitcast_index_to_i8_vector(%input: vector<16xindex>) -> vector<128xi8> { @@ -46,6 +64,16 @@ func.func @bitcast_index_to_i8_vector(%input: vector<16xindex>) -> vector<128xi8 // CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[input]] : vector<16xindex> to vector<16xi64> // CHECK: llvm.bitcast %[[T0]] : vector<16xi64> to vector<128xi8> +func.func @bitcast_index_to_i8_vector_scalable(%input: vector<[16]xindex>) -> vector<[128]xi8> { + %0 = vector.bitcast %input : vector<[16]xindex> to vector<[128]xi8> + return %0 : vector<[128]xi8> +} + +// CHECK-LABEL: @bitcast_index_to_i8_vector_scalable +// CHECK-SAME: %[[input:.*]]: vector<[16]xindex> +// CHECK: %[[T0:.*]] = builtin.unrealized_conversion_cast %[[input]] : vector<[16]xindex> to vector<[16]xi64> +// CHECK: llvm.bitcast %[[T0]] : vector<[16]xi64> to vector<[128]xi8> + // ----- func.func @broadcast_vec0d_from_f32(%arg0: f32) -> vector { @@ -80,6 +108,17 @@ func.func @broadcast_vec1d_from_f32(%arg0: f32) -> vector<2xf32> { // CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] // CHECK: return %[[T1]] : vector<2xf32> + +func.func @broadcast_vec1d_from_f32_scalable(%arg0: f32) -> vector<[2]xf32> { + %0 = vector.broadcast %arg0 : f32 to vector<[2]xf32> + return %0 : vector<[2]xf32> +} +// CHECK-LABEL: @broadcast_vec1d_from_f32_scalable +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: return %[[T1]] : vector<[2]xf32> + // ----- func.func @broadcast_vec1d_from_index(%arg0: index) -> vector<2xindex> { @@ -94,6 +133,18 @@ 
func.func @broadcast_vec1d_from_index(%arg0: index) -> vector<2xindex> { // CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<2xi64> to vector<2xindex> // CHECK: return %[[T2]] : vector<2xindex> +func.func @broadcast_vec1d_from_index_scalable(%arg0: index) -> vector<[2]xindex> { + %0 = vector.broadcast %arg0 : index to vector<[2]xindex> + return %0 : vector<[2]xindex> +} +// CHECK-LABEL: @broadcast_vec1d_from_index_scalable +// CHECK-SAME: %[[A:.*]]: index) +// CHECK: %[[A1:.*]] = builtin.unrealized_conversion_cast %[[A]] : index to i64 +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A1]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<[2]xi64> to vector<[2]xindex> +// CHECK: return %[[T2]] : vector<[2]xindex> + // ----- func.func @broadcast_vec2d_from_scalar(%arg0: f32) -> vector<2x3xf32> { @@ -109,6 +160,19 @@ func.func @broadcast_vec2d_from_scalar(%arg0: f32) -> vector<2x3xf32> { // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T4]] : vector<2x3xf32> +func.func @broadcast_vec2d_from_scalar_scalable(%arg0: f32) -> vector<2x[3]xf32> { + %0 = vector.broadcast %arg0 : f32 to vector<2x[3]xf32> + return %0 : vector<2x[3]xf32> +} +// CHECK-LABEL: @broadcast_vec2d_from_scalar_scalable( +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T4]] : vector<2x[3]xf32> + // ----- func.func @broadcast_vec3d_from_scalar(%arg0: f32) -> vector<2x3x4xf32> { @@ -125,6 +189,21 @@ func.func 
@broadcast_vec3d_from_scalar(%arg0: f32) -> vector<2x3x4xf32> { // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x array<3 x vector<4xf32>>> to vector<2x3x4xf32> // CHECK: return %[[T4]] : vector<2x3x4xf32> + +func.func @broadcast_vec3d_from_scalar_scalable(%arg0: f32) -> vector<2x3x[4]xf32> { + %0 = vector.broadcast %arg0 : f32 to vector<2x3x[4]xf32> + return %0 : vector<2x3x[4]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_scalar_scalable( +// CHECK-SAME: %[[A:.*]]: f32) +// CHECK: %[[T0:.*]] = llvm.insertelement %[[A]] +// CHECK: %[[T1:.*]] = llvm.shufflevector %[[T0]] +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[0, 0] : !llvm.array<2 x array<3 x vector<[4]xf32>>> +// ... +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %{{.*}}[1, 2] : !llvm.array<2 x array<3 x vector<[4]xf32>>> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T3]] : !llvm.array<2 x array<3 x vector<[4]xf32>>> to vector<2x3x[4]xf32> +// CHECK: return %[[T4]] : vector<2x3x[4]xf32> + // ----- func.func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> { @@ -135,6 +214,14 @@ func.func @broadcast_vec1d_from_vec1d(%arg0: vector<2xf32>) -> vector<2xf32> { // CHECK-SAME: %[[A:.*]]: vector<2xf32>) // CHECK: return %[[A]] : vector<2xf32> +func.func @broadcast_vec1d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to vector<[2]xf32> + return %0 : vector<[2]xf32> +} +// CHECK-LABEL: @broadcast_vec1d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK: return %[[A]] : vector<[2]xf32> + // ----- func.func @broadcast_vec2d_from_vec0d(%arg0: vector) -> vector<3x2xf32> { @@ -172,6 +259,20 @@ func.func @broadcast_vec2d_from_vec1d(%arg0: vector<2xf32>) -> vector<3x2xf32> { // CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : !llvm.array<3 x vector<2xf32>> to vector<3x2xf32> // CHECK: return %[[T5]] : vector<3x2xf32> +func.func 
@broadcast_vec2d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to vector<3x[2]xf32> + return %0 : vector<3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec2d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T2:.*]] = llvm.insertvalue %[[A]], %[[T1]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.insertvalue %[[A]], %[[T3]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T4]] : !llvm.array<3 x vector<[2]xf32>> to vector<3x[2]xf32> +// CHECK: return %[[T5]] : vector<3x[2]xf32> + // ----- func.func @broadcast_vec2d_from_index_vec1d(%arg0: vector<2xindex>) -> vector<3x2xindex> { @@ -188,6 +289,20 @@ func.func @broadcast_vec2d_from_index_vec1d(%arg0: vector<2xindex>) -> vector<3x // CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %{{.*}} : !llvm.array<3 x vector<2xi64>> to vector<3x2xindex> // CHECK: return %[[T4]] : vector<3x2xindex> +func.func @broadcast_vec2d_from_index_vec1d_scalable(%arg0: vector<[2]xindex>) -> vector<3x[2]xindex> { + %0 = vector.broadcast %arg0 : vector<[2]xindex> to vector<3x[2]xindex> + return %0 : vector<3x[2]xindex> +} +// CHECK-LABEL: @broadcast_vec2d_from_index_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xindex>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<[2]xindex> to vector<[2]xi64> +// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<3x[2]xindex> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xindex> to !llvm.array<3 x vector<[2]xi64>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : 
!llvm.array<3 x vector<[2]xi64>> + +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %{{.*}} : !llvm.array<3 x vector<[2]xi64>> to vector<3x[2]xindex> +// CHECK: return %[[T4]] : vector<3x[2]xindex> + // ----- func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> { @@ -213,6 +328,29 @@ func.func @broadcast_vec3d_from_vec1d(%arg0: vector<2xf32>) -> vector<4x3x2xf32> // CHECK: %[[T11:.*]] = builtin.unrealized_conversion_cast %[[T10]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T11]] : vector<4x3x2xf32> +func.func @broadcast_vec3d_from_vec1d_scalable(%arg0: vector<[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<[2]xf32> to vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_vec1d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<[2]xf32>) +// CHECK-DAG: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK-DAG: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK-DAG: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK-DAG: %[[T6:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> + +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[A]], %[[T2]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.insertvalue %[[A]], %[[T3]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[A]], %[[T4]][2] : !llvm.array<3 x vector<[2]xf32>> + +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T5]], %[[T6]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T5]], %[[T7]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T5]], %[[T8]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T10:.*]] = llvm.insertvalue %[[T5]], %[[T9]][3] : 
!llvm.array<4 x array<3 x vector<[2]xf32>>> + +// CHECK: %[[T11:.*]] = builtin.unrealized_conversion_cast %[[T10]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T11]] : vector<4x3x[2]xf32> + // ----- func.func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf32> { @@ -231,6 +369,22 @@ func.func @broadcast_vec3d_from_vec2d(%arg0: vector<3x2xf32>) -> vector<4x3x2xf3 // CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T9]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T10]] : vector<4x3x2xf32> +func.func @broadcast_vec3d_from_vec2d_scalable(%arg0: vector<3x[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<3x[2]xf32> to vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_vec3d_from_vec2d_scalable( +// CHECK-SAME: %[[A:.*]]: vector<3x[2]xf32>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T0:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T3:.*]] = llvm.insertvalue %[[T1]], %[[T2]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[T1]], %[[T3]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T1]], %[[T5]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T9:.*]] = llvm.insertvalue %[[T1]], %[[T7]][3] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T9]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T10]] : vector<4x3x[2]xf32> + // ----- @@ -246,6 +400,18 @@ func.func @broadcast_stretch(%arg0: vector<1xf32>) -> vector<4xf32> { // CHECK: %[[T4:.*]] = 
llvm.shufflevector %[[T3]] // CHECK: return %[[T4]] : vector<4xf32> +func.func @broadcast_stretch_scalable(%arg0: vector<1xf32>) -> vector<[4]xf32> { + %0 = vector.broadcast %arg0 : vector<1xf32> to vector<[4]xf32> + return %0 : vector<[4]xf32> +} +// CHECK-LABEL: @broadcast_stretch_scalable( +// CHECK-SAME: %[[A:.*]]: vector<1xf32>) +// CHECK: %[[T1:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T2:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T1]] : i64] : vector<1xf32> +// CHECK: %[[T3:.*]] = llvm.insertelement %[[T2]] +// CHECK: %[[T4:.*]] = llvm.shufflevector %[[T3]] +// CHECK: return %[[T4]] : vector<[4]xf32> + // ----- func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> { @@ -264,6 +430,22 @@ func.func @broadcast_stretch_at_start(%arg0: vector<1x4xf32>) -> vector<3x4xf32> // CHECK: %[[T8:.*]] = builtin.unrealized_conversion_cast %[[T7]] : !llvm.array<3 x vector<4xf32>> to vector<3x4xf32> // CHECK: return %[[T8]] : vector<3x4xf32> +func.func @broadcast_stretch_at_start_scalable(%arg0: vector<1x[4]xf32>) -> vector<3x[4]xf32> { + %0 = vector.broadcast %arg0 : vector<1x[4]xf32> to vector<3x[4]xf32> + return %0 : vector<3x[4]xf32> +} +// CHECK-LABEL: @broadcast_stretch_at_start_scalable( +// CHECK-SAME: %[[A:.*]]: vector<1x[4]xf32>) +// CHECK: %[[T2:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<1x[4]xf32> to !llvm.array<1 x vector<[4]xf32>> +// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<3x[4]xf32> +// CHECK: %[[T4:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<3x[4]xf32> to !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T3:.*]] = llvm.extractvalue %[[T2]][0] : !llvm.array<1 x vector<[4]xf32>> +// CHECK: %[[T5:.*]] = llvm.insertvalue %[[T3]], %[[T4]][0] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T6:.*]] = llvm.insertvalue %[[T3]], %[[T5]][1] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T3]], %[[T6]][2] : !llvm.array<3 x vector<[4]xf32>> +// CHECK: 
%[[T8:.*]] = builtin.unrealized_conversion_cast %[[T7]] : !llvm.array<3 x vector<[4]xf32>> to vector<3x[4]xf32> +// CHECK: return %[[T8]] : vector<3x[4]xf32> + // ----- func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { @@ -302,6 +484,16 @@ func.func @broadcast_stretch_at_end(%arg0: vector<4x1xf32>) -> vector<4x3xf32> { // CHECK: %[[T27:.*]] = builtin.unrealized_conversion_cast %[[T26]] : !llvm.array<4 x vector<3xf32>> to vector<4x3xf32> // CHECK: return %[[T27]] : vector<4x3xf32> +// TODO: Add support for scalable vectors + +func.func @broadcast_stretch_at_end_scalable(%arg0: vector<[4]x1xf32>) -> vector<[4]x3xf32> { + %0 = vector.broadcast %arg0 : vector<[4]x1xf32> to vector<[4]x3xf32> + return %0 : vector<[4]x3xf32> +} +// CHECK-LABEL: @broadcast_stretch_at_end_scalable +// CHECK-SAME: %[[A:.*]]: vector<[4]x1xf32>) +// CHECK: vector.broadcast %[[A]] : vector<[4]x1xf32> to vector<[4]x3xf32> + // ----- func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2xf32> { @@ -338,6 +530,50 @@ func.func @broadcast_stretch_in_middle(%arg0: vector<4x1x2xf32>) -> vector<4x3x2 // CHECK: %[[T32:.*]] = builtin.unrealized_conversion_cast %[[T31]] : !llvm.array<4 x array<3 x vector<2xf32>>> to vector<4x3x2xf32> // CHECK: return %[[T32]] : vector<4x3x2xf32> +func.func @broadcast_stretch_in_middle_scalable_v1(%arg0: vector<4x1x[2]xf32>) -> vector<4x3x[2]xf32> { + %0 = vector.broadcast %arg0 : vector<4x1x[2]xf32> to vector<4x3x[2]xf32> + return %0 : vector<4x3x[2]xf32> +} +// CHECK-LABEL: @broadcast_stretch_in_middle_scalable_v1( +// CHECK-SAME: %[[A:.*]]: vector<4x1x[2]xf32>) -> vector<4x3x[2]xf32> { +// CHECK: %[[T3:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<4x1x[2]xf32> to !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T1:.*]] = arith.constant dense<0.000000e+00> : vector<4x3x[2]xf32> +// CHECK: %[[T9:.*]] = builtin.unrealized_conversion_cast %[[T1]] : vector<4x3x[2]xf32> to !llvm.array<4 x array<3 x 
vector<[2]xf32>>> +// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<3x[2]xf32> +// CHECK: %[[T5:.*]] = builtin.unrealized_conversion_cast %[[T2]] : vector<3x[2]xf32> to !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T4:.*]] = llvm.extractvalue %[[T3]][0, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T6:.*]] = llvm.insertvalue %[[T4]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T7:.*]] = llvm.insertvalue %[[T4]], %[[T6]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T4]], %[[T7]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T10:.*]] = llvm.insertvalue %[[T8]], %[[T9]][0] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T12:.*]] = llvm.extractvalue %[[T3]][1, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T14:.*]] = llvm.insertvalue %[[T12]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T15:.*]] = llvm.insertvalue %[[T12]], %[[T14]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T16:.*]] = llvm.insertvalue %[[T12]], %[[T15]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T17:.*]] = llvm.insertvalue %[[T16]], %[[T10]][1] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T19:.*]] = llvm.extractvalue %[[T3]][2, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T21:.*]] = llvm.insertvalue %[[T19]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T22:.*]] = llvm.insertvalue %[[T19]], %[[T21]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T23:.*]] = llvm.insertvalue %[[T19]], %[[T22]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T24:.*]] = llvm.insertvalue %[[T23]], %[[T17]][2] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T26:.*]] = llvm.extractvalue %[[T3]][3, 0] : !llvm.array<4 x array<1 x vector<[2]xf32>>> +// CHECK: %[[T28:.*]] = llvm.insertvalue %[[T26]], %[[T5]][0] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T29:.*]] = llvm.insertvalue %[[T26]], 
%[[T28]][1] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T30:.*]] = llvm.insertvalue %[[T26]], %[[T29]][2] : !llvm.array<3 x vector<[2]xf32>> +// CHECK: %[[T31:.*]] = llvm.insertvalue %[[T30]], %[[T24]][3] : !llvm.array<4 x array<3 x vector<[2]xf32>>> +// CHECK: %[[T32:.*]] = builtin.unrealized_conversion_cast %[[T31]] : !llvm.array<4 x array<3 x vector<[2]xf32>>> to vector<4x3x[2]xf32> +// CHECK: return %[[T32]] : vector<4x3x[2]xf32> + +// TODO: Add support for scalable vectors + +func.func @broadcast_stretch_in_middle_scalable_v2(%arg0: vector<[4]x1x2xf32>) -> vector<[4]x3x2xf32> { + %0 = vector.broadcast %arg0 : vector<[4]x1x2xf32> to vector<[4]x3x2xf32> + return %0 : vector<[4]x3x2xf32> +} +// CHECK-LABEL: @broadcast_stretch_in_middle_scalable_v2( +// CHECK-SAME: %[[A:.*]]: vector<[4]x1x2xf32>) -> vector<[4]x3x2xf32> { +// CHECK: vector.broadcast %[[A]] : vector<[4]x1x2xf32> to vector<[4]x3x2xf32> + // ----- func.func @outerproduct(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<2x3xf32> { From 435717556908a79748de22c21e3a0ee06a933036 Mon Sep 17 00:00:00 2001 From: Volodymyr Sapsai Date: Thu, 8 Aug 2024 11:59:47 -0300 Subject: [PATCH 019/112] [re-format][Modules] Follow-up formatting to "Mention which AST file's options differ from the current TU options." (#102484) Fix formatting for fdf8e3e31103bc81917cdb27150877f524bb2669. 
--- clang/include/clang/Serialization/ASTReader.h | 76 ++--- clang/lib/Frontend/ASTUnit.cpp | 10 +- clang/lib/Frontend/FrontendActions.cpp | 13 +- clang/lib/Serialization/ASTReader.cpp | 277 ++++++++++-------- 4 files changed, 208 insertions(+), 168 deletions(-) diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index db93fbd703260d9..549396a3b6b387b 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -130,8 +130,7 @@ class ASTReaderListener { /// /// \returns true to indicate the options are invalid or false otherwise. virtual bool ReadLanguageOptions(const LangOptions &LangOpts, - StringRef ModuleFilename, - bool Complain, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { return false; } @@ -140,7 +139,8 @@ class ASTReaderListener { /// /// \returns true to indicate the target options are invalid, or false /// otherwise. - virtual bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + virtual bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) { return false; } @@ -151,8 +151,7 @@ class ASTReaderListener { /// otherwise. 
virtual bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - StringRef ModuleFilename, - bool Complain) { + StringRef ModuleFilename, bool Complain) { return false; } @@ -266,13 +265,14 @@ class ChainedASTReaderListener : public ASTReaderListener { bool ReadFullVersionInformation(StringRef FullVersion) override; void ReadModuleName(StringRef ModuleName) override; void ReadModuleMapFile(StringRef ModuleMapPath) override; - bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; - bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - StringRef ModuleFilename, - bool Complain) override; + StringRef ModuleFilename, bool Complain) override; bool ReadFileSystemOptions(const FileSystemOptions &FSOpts, bool Complain) override; @@ -281,8 +281,8 @@ class ChainedASTReaderListener : public ASTReaderListener { StringRef SpecificModuleCachePath, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; void ReadCounter(const serialization::ModuleFile &M, unsigned Value) override; @@ -306,16 +306,17 @@ class PCHValidator : public ASTReaderListener { PCHValidator(Preprocessor &PP, ASTReader &Reader) : PP(PP), Reader(Reader) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; - 
bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override; bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - StringRef ModuleFilename, - bool Complain) override; + StringRef ModuleFilename, bool Complain) override; bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, @@ -335,8 +336,8 @@ class SimpleASTReaderListener : public ASTReaderListener { SimpleASTReaderListener(Preprocessor &PP) : PP(PP) {} bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override; }; @@ -1377,10 +1378,12 @@ class ASTReader SmallVectorImpl &Loaded, const ModuleFile *ImportedBy, unsigned ClientLoadCapabilities); - static ASTReadResult ReadOptionsBlock( - llvm::BitstreamCursor &Stream, StringRef Filename, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, - std::string &SuggestedPredefines); + static ASTReadResult + ReadOptionsBlock(llvm::BitstreamCursor &Stream, StringRef Filename, + unsigned ClientLoadCapabilities, + bool AllowCompatibleConfigurationMismatch, + ASTReaderListener &Listener, + std::string &SuggestedPredefines); /// Read the unhashed control block. 
/// @@ -1389,13 +1392,11 @@ class ASTReader ASTReadResult readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, unsigned ClientLoadCapabilities); - static ASTReadResult - readUnhashedControlBlockImpl(ModuleFile *F, llvm::StringRef StreamData, - StringRef Filename, - unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, - ASTReaderListener *Listener, - bool ValidateDiagnosticOptions); + static ASTReadResult readUnhashedControlBlockImpl( + ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, + unsigned ClientLoadCapabilities, + bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, + bool ValidateDiagnosticOptions); llvm::Error ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities); llvm::Error ReadExtensionBlock(ModuleFile &F); @@ -1408,21 +1409,26 @@ class ASTReader unsigned ClientLoadCapabilities); llvm::Error ReadSubmoduleBlock(ModuleFile &F, unsigned ClientLoadCapabilities); - static bool ParseLanguageOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, + static bool ParseLanguageOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, + static bool ParseTargetOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences); - static bool ParseDiagnosticOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, + static bool ParseDiagnosticOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseFileSystemOptions(const RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParseHeaderSearchOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, + static bool ParseHeaderSearchOptions(const RecordData 
&Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener); static bool ParseHeaderSearchPaths(const RecordData &Record, bool Complain, ASTReaderListener &Listener); - static bool ParsePreprocessorOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, + static bool ParsePreprocessorOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines); diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index aec1402d6cb95b4..84e273a3949ef02 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -536,7 +536,8 @@ class ASTInfoCollector : public ASTReaderListener { LangOpt(LangOpt), TargetOpts(TargetOpts), Target(Target), Counter(Counter) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { if (InitializedLanguage) return false; @@ -598,14 +599,15 @@ class ASTInfoCollector : public ASTReaderListener { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { this->PPOpts = PPOpts; return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { // If we've already initialized the target, don't do it again. 
if (Target) diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index f9887fe634ec6cd..9f5d09e33ce244f 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -622,7 +622,8 @@ namespace { Out.indent(2) << "Module map file: " << ModuleMapPath << "\n"; } - bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Language options:\n"; #define LANGOPT(Name, Bits, Default, Description) \ @@ -645,7 +646,8 @@ namespace { return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { Out.indent(2) << "Target options:\n"; Out.indent(4) << " Triple: " << TargetOpts.Triple << "\n"; @@ -665,7 +667,8 @@ namespace { } bool ReadDiagnosticOptions(IntrusiveRefCntPtr DiagOpts, - StringRef ModuleFilename, bool Complain) override { + StringRef ModuleFilename, + bool Complain) override { Out.indent(2) << "Diagnostic options:\n"; #define DIAGOPT(Name, Bits, Default) DUMP_BOOLEAN(DiagOpts->Name, #Name); #define ENUM_DIAGOPT(Name, Type, Bits, Default) \ @@ -718,8 +721,8 @@ namespace { } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { Out.indent(2) << "Preprocessor options:\n"; DUMP_BOOLEAN(PPOpts.UsePredefines, diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 7fbd0ddde2d8a75..ad8d6c336f27809 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ 
-172,11 +172,9 @@ void ChainedASTReaderListener::ReadModuleMapFile(StringRef ModuleMapPath) { Second->ReadModuleMapFile(ModuleMapPath); } -bool -ChainedASTReaderListener::ReadLanguageOptions(const LangOptions &LangOpts, - StringRef ModuleFilename, - bool Complain, - bool AllowCompatibleDifferences) { +bool ChainedASTReaderListener::ReadLanguageOptions( + const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { return First->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, AllowCompatibleDifferences) || Second->ReadLanguageOptions(LangOpts, ModuleFilename, Complain, @@ -193,7 +191,8 @@ bool ChainedASTReaderListener::ReadTargetOptions( } bool ChainedASTReaderListener::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, bool Complain) { + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, + bool Complain) { return First->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain) || Second->ReadDiagnosticOptions(DiagOpts, ModuleFilename, Complain); } @@ -206,21 +205,21 @@ ChainedASTReaderListener::ReadFileSystemOptions(const FileSystemOptions &FSOpts, } bool ChainedASTReaderListener::ReadHeaderSearchOptions( - const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, StringRef SpecificModuleCachePath, - bool Complain) { - return First->ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, - Complain) || - Second->ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, - Complain); + const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, + StringRef SpecificModuleCachePath, bool Complain) { + return First->ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain) || + Second->ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain); } bool ChainedASTReaderListener::ReadPreprocessorOptions( - const PreprocessorOptions &PPOpts, StringRef ModuleFilename, bool ReadMacros, bool Complain, - 
std::string &SuggestedPredefines) { - return First->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, - SuggestedPredefines) || - Second->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, - SuggestedPredefines); + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, + bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { + return First->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines) || + Second->ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines); } void ChainedASTReaderListener::ReadCounter(const serialization::ModuleFile &M, @@ -283,35 +282,37 @@ ASTReaderListener::~ASTReaderListener() = default; /// \returns true if the languagae options mis-match, false otherwise. static bool checkLanguageOptions(const LangOptions &LangOpts, const LangOptions &ExistingLangOpts, - StringRef ModuleFilename, DiagnosticsEngine *Diags, + StringRef ModuleFilename, + DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { -#define LANGOPT(Name, Bits, Default, Description) \ - if (ExistingLangOpts.Name != LangOpts.Name) { \ - if (Diags) { \ - if (Bits == 1) \ - Diags->Report(diag::err_ast_file_langopt_mismatch) \ - << Description << LangOpts.Name << ExistingLangOpts.Name << ModuleFilename; \ - else \ - Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ - << Description << ModuleFilename; \ - } \ - return true; \ - } - -#define VALUE_LANGOPT(Name, Bits, Default, Description) \ - if (ExistingLangOpts.Name != LangOpts.Name) { \ - if (Diags) \ - Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ - << Description << ModuleFilename; \ - return true; \ - } - -#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ - if (ExistingLangOpts.get##Name() != LangOpts.get##Name()) { \ - if (Diags) \ - Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ - << Description << ModuleFilename; \ - return true; \ 
+#define LANGOPT(Name, Bits, Default, Description) \ + if (ExistingLangOpts.Name != LangOpts.Name) { \ + if (Diags) { \ + if (Bits == 1) \ + Diags->Report(diag::err_ast_file_langopt_mismatch) \ + << Description << LangOpts.Name << ExistingLangOpts.Name \ + << ModuleFilename; \ + else \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + } \ + return true; \ + } + +#define VALUE_LANGOPT(Name, Bits, Default, Description) \ + if (ExistingLangOpts.Name != LangOpts.Name) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + return true; \ + } + +#define ENUM_LANGOPT(Name, Type, Bits, Default, Description) \ + if (ExistingLangOpts.get##Name() != LangOpts.get##Name()) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_langopt_value_mismatch) \ + << Description << ModuleFilename; \ + return true; \ } #define COMPATIBLE_LANGOPT(Name, Bits, Default, Description) \ @@ -333,14 +334,15 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, if (ExistingLangOpts.ModuleFeatures != LangOpts.ModuleFeatures) { if (Diags) - Diags->Report(diag::err_ast_file_langopt_value_mismatch) << "module features" << ModuleFilename; + Diags->Report(diag::err_ast_file_langopt_value_mismatch) + << "module features" << ModuleFilename; return true; } if (ExistingLangOpts.ObjCRuntime != LangOpts.ObjCRuntime) { if (Diags) Diags->Report(diag::err_ast_file_langopt_value_mismatch) - << "target Objective-C runtime" << ModuleFilename; + << "target Objective-C runtime" << ModuleFilename; return true; } @@ -348,7 +350,7 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, LangOpts.CommentOpts.BlockCommandNames) { if (Diags) Diags->Report(diag::err_ast_file_langopt_value_mismatch) - << "block command names" << ModuleFilename; + << "block command names" << ModuleFilename; return true; } @@ -390,14 +392,16 @@ static bool checkLanguageOptions(const LangOptions &LangOpts, /// 
\returns true if the target options mis-match, false otherwise. static bool checkTargetOptions(const TargetOptions &TargetOpts, const TargetOptions &ExistingTargetOpts, - StringRef ModuleFilename, DiagnosticsEngine *Diags, + StringRef ModuleFilename, + DiagnosticsEngine *Diags, bool AllowCompatibleDifferences = true) { -#define CHECK_TARGET_OPT(Field, Name) \ - if (TargetOpts.Field != ExistingTargetOpts.Field) { \ - if (Diags) \ - Diags->Report(diag::err_ast_file_targetopt_mismatch) \ - << ModuleFilename << Name << TargetOpts.Field << ExistingTargetOpts.Field; \ - return true; \ +#define CHECK_TARGET_OPT(Field, Name) \ + if (TargetOpts.Field != ExistingTargetOpts.Field) { \ + if (Diags) \ + Diags->Report(diag::err_ast_file_targetopt_mismatch) \ + << ModuleFilename << Name << TargetOpts.Field \ + << ExistingTargetOpts.Field; \ + return true; \ } // The triple and ABI must match exactly. @@ -450,11 +454,9 @@ static bool checkTargetOptions(const TargetOptions &TargetOpts, return !UnmatchedReadFeatures.empty() || !UnmatchedExistingFeatures.empty(); } -bool -PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, - StringRef ModuleFilename, - bool Complain, - bool AllowCompatibleDifferences) { +bool PCHValidator::ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, + bool AllowCompatibleDifferences) { const LangOptions &ExistingLangOpts = PP.getLangOpts(); return checkLanguageOptions(LangOpts, ExistingLangOpts, ModuleFilename, Complain ? 
&Reader.Diags : nullptr, @@ -480,7 +482,8 @@ using DeclsMap = llvm::DenseMap>; static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, DiagnosticsEngine &Diags, - StringRef ModuleFilename, bool Complain) { + StringRef ModuleFilename, + bool Complain) { using Level = DiagnosticsEngine::Level; // Check current mappings for new -Werror mappings, and the stored mappings @@ -498,8 +501,11 @@ static bool checkDiagnosticGroupMappings(DiagnosticsEngine &StoredDiags, StoredDiags.getDiagnosticLevel(DiagID, SourceLocation()); if (StoredLevel < DiagnosticsEngine::Error) { if (Complain) - Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Werror=" + - Diags.getDiagnosticIDs()->getWarningOptionForDiag(DiagID).str() << ModuleFilename; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Werror=" + Diags.getDiagnosticIDs() + ->getWarningOptionForDiag(DiagID) + .str() + << ModuleFilename; return true; } } @@ -516,7 +522,8 @@ static bool isExtHandlingFromDiagsError(DiagnosticsEngine &Diags) { } static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, - DiagnosticsEngine &Diags, StringRef ModuleFilename, bool IsSystem, + DiagnosticsEngine &Diags, + StringRef ModuleFilename, bool IsSystem, bool SystemHeaderWarningsInModule, bool Complain) { // Top-level options @@ -528,32 +535,37 @@ static bool checkDiagnosticMappings(DiagnosticsEngine &StoredDiags, if (StoredDiags.getSuppressSystemWarnings() && !SystemHeaderWarningsInModule) { if (Complain) - Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Wsystem-headers" << ModuleFilename; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Wsystem-headers" << ModuleFilename; return true; } } if (Diags.getWarningsAsErrors() && !StoredDiags.getWarningsAsErrors()) { if (Complain) - Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Werror" << ModuleFilename; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Werror" << ModuleFilename; return true; } if (Diags.getWarningsAsErrors() && 
Diags.getEnableAllWarnings() && !StoredDiags.getEnableAllWarnings()) { if (Complain) - Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-Weverything -Werror" << ModuleFilename; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-Weverything -Werror" << ModuleFilename; return true; } if (isExtHandlingFromDiagsError(Diags) && !isExtHandlingFromDiagsError(StoredDiags)) { if (Complain) - Diags.Report(diag::err_ast_file_diagopt_mismatch) << "-pedantic-errors" << ModuleFilename; + Diags.Report(diag::err_ast_file_diagopt_mismatch) + << "-pedantic-errors" << ModuleFilename; return true; } - return checkDiagnosticGroupMappings(StoredDiags, Diags, ModuleFilename, Complain); + return checkDiagnosticGroupMappings(StoredDiags, Diags, ModuleFilename, + Complain); } /// Return the top import module if it is implicit, nullptr otherwise. @@ -582,7 +594,8 @@ static Module *getTopImportImplicitModule(ModuleManager &ModuleMgr, } bool PCHValidator::ReadDiagnosticOptions( - IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, bool Complain) { + IntrusiveRefCntPtr DiagOpts, StringRef ModuleFilename, + bool Complain) { DiagnosticsEngine &ExistingDiags = PP.getDiagnostics(); IntrusiveRefCntPtr DiagIDs(ExistingDiags.getDiagnosticIDs()); IntrusiveRefCntPtr Diags( @@ -607,8 +620,9 @@ bool PCHValidator::ReadDiagnosticOptions( // FIXME: if the diagnostics are incompatible, save a DiagnosticOptions that // contains the union of their flags. - return checkDiagnosticMappings(*Diags, ExistingDiags, ModuleFilename, TopM->IsSystem, - SystemHeaderWarningsInModule, Complain); + return checkDiagnosticMappings(*Diags, ExistingDiags, ModuleFilename, + TopM->IsSystem, SystemHeaderWarningsInModule, + Complain); } /// Collect the macro definitions provided by the given preprocessor @@ -667,8 +681,8 @@ enum OptionValidation { /// are no differences in the options between the two. 
static bool checkPreprocessorOptions( const PreprocessorOptions &PPOpts, - const PreprocessorOptions &ExistingPPOpts, StringRef ModuleFilename, bool ReadMacros, - DiagnosticsEngine *Diags, FileManager &FileMgr, + const PreprocessorOptions &ExistingPPOpts, StringRef ModuleFilename, + bool ReadMacros, DiagnosticsEngine *Diags, FileManager &FileMgr, std::string &SuggestedPredefines, const LangOptions &LangOpts, OptionValidation Validation = OptionValidateContradictions) { if (ReadMacros) { @@ -697,7 +711,8 @@ static bool checkPreprocessorOptions( // If strict matches are requested, don't tolerate any extra defines // on the command line that are missing in the AST file. if (Diags) { - Diags->Report(diag::err_ast_file_macro_def_undef) << MacroName << true << ModuleFilename; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << true << ModuleFilename; } return true; } @@ -739,7 +754,8 @@ static bool checkPreprocessorOptions( // The macro bodies differ; complain. if (Diags) { Diags->Report(diag::err_ast_file_macro_def_conflict) - << MacroName << Known->second.first << Existing.first << ModuleFilename; + << MacroName << Known->second.first << Existing.first + << ModuleFilename; } return true; } @@ -752,7 +768,8 @@ static bool checkPreprocessorOptions( // the AST file that are missing on the command line. 
for (const auto &MacroName : ASTFileMacros.keys()) { if (Diags) { - Diags->Report(diag::err_ast_file_macro_def_undef) << MacroName << false << ModuleFilename; + Diags->Report(diag::err_ast_file_macro_def_undef) + << MacroName << false << ModuleFilename; } return true; } @@ -763,7 +780,8 @@ static bool checkPreprocessorOptions( if (PPOpts.UsePredefines != ExistingPPOpts.UsePredefines && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_ast_file_undef) << ExistingPPOpts.UsePredefines << ModuleFilename; + Diags->Report(diag::err_ast_file_undef) + << ExistingPPOpts.UsePredefines << ModuleFilename; } return true; } @@ -773,7 +791,8 @@ static bool checkPreprocessorOptions( PPOpts.DetailedRecord != ExistingPPOpts.DetailedRecord && Validation != OptionValidateNone) { if (Diags) { - Diags->Report(diag::err_ast_file_pp_detailed_record) << PPOpts.DetailedRecord << ModuleFilename; + Diags->Report(diag::err_ast_file_pp_detailed_record) + << PPOpts.DetailedRecord << ModuleFilename; } return true; } @@ -817,22 +836,24 @@ static bool checkPreprocessorOptions( } bool PCHValidator::ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, bool ReadMacros, bool Complain, + StringRef ModuleFilename, + bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { const PreprocessorOptions &ExistingPPOpts = PP.getPreprocessorOpts(); return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, Complain ? &Reader.Diags : nullptr, - PP.getFileManager(), SuggestedPredefines, PP.getLangOpts()); + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, + Complain ? 
&Reader.Diags : nullptr, PP.getFileManager(), + SuggestedPredefines, PP.getLangOpts()); } bool SimpleASTReaderListener::ReadPreprocessorOptions( - const PreprocessorOptions &PPOpts, StringRef ModuleFilename, bool ReadMacros, bool Complain, - std::string &SuggestedPredefines) { - return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), ModuleFilename, ReadMacros, - nullptr, PP.getFileManager(), - SuggestedPredefines, PP.getLangOpts(), - OptionValidateNone); + const PreprocessorOptions &PPOpts, StringRef ModuleFilename, + bool ReadMacros, bool Complain, std::string &SuggestedPredefines) { + return checkPreprocessorOptions(PPOpts, PP.getPreprocessorOpts(), + ModuleFilename, ReadMacros, nullptr, + PP.getFileManager(), SuggestedPredefines, + PP.getLangOpts(), OptionValidateNone); } /// Check that the specified and the existing module cache paths are equivalent. @@ -842,7 +863,8 @@ bool SimpleASTReaderListener::ReadPreprocessorOptions( static bool checkModuleCachePath(llvm::vfs::FileSystem &VFS, StringRef SpecificModuleCachePath, StringRef ExistingModuleCachePath, - StringRef ModuleFilename, DiagnosticsEngine *Diags, + StringRef ModuleFilename, + DiagnosticsEngine *Diags, const LangOptions &LangOpts, const PreprocessorOptions &PPOpts) { if (!LangOpts.Modules || PPOpts.AllowPCHWithDifferentModulesCachePath || @@ -862,11 +884,11 @@ bool PCHValidator::ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) { - return checkModuleCachePath(Reader.getFileManager().getVirtualFileSystem(), - SpecificModuleCachePath, - PP.getHeaderSearchInfo().getModuleCachePath(), - ModuleFilename, Complain ? &Reader.Diags : nullptr, - PP.getLangOpts(), PP.getPreprocessorOpts()); + return checkModuleCachePath( + Reader.getFileManager().getVirtualFileSystem(), SpecificModuleCachePath, + PP.getHeaderSearchInfo().getModuleCachePath(), ModuleFilename, + Complain ? 
&Reader.Diags : nullptr, PP.getLangOpts(), + PP.getPreprocessorOpts()); } void PCHValidator::ReadCounter(const ModuleFile &M, unsigned Value) { @@ -2764,9 +2786,9 @@ static bool isDiagnosedResult(ASTReader::ASTReadResult ARR, unsigned Caps) { } ASTReader::ASTReadResult ASTReader::ReadOptionsBlock( - BitstreamCursor &Stream, StringRef Filename, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener &Listener, - std::string &SuggestedPredefines) { + BitstreamCursor &Stream, StringRef Filename, + unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, + ASTReaderListener &Listener, std::string &SuggestedPredefines) { if (llvm::Error Err = Stream.EnterSubBlock(OPTIONS_BLOCK_ID)) { // FIXME this drops errors on the floor. consumeError(std::move(Err)); @@ -4875,8 +4897,8 @@ ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, bool DisableValidation = shouldDisableValidationForFile(F); ASTReadResult Result = readUnhashedControlBlockImpl( - &F, F.Data, F.FileName, ClientLoadCapabilities, AllowCompatibleConfigurationMismatch, - Listener.get(), + &F, F.Data, F.FileName, ClientLoadCapabilities, + AllowCompatibleConfigurationMismatch, Listener.get(), WasImportedBy ? false : HSOpts.ModulesValidateDiagnosticOptions); // If F was directly imported by another module, it's implicitly validated by @@ -4919,9 +4941,9 @@ ASTReader::readUnhashedControlBlock(ModuleFile &F, bool WasImportedBy, } ASTReader::ASTReadResult ASTReader::readUnhashedControlBlockImpl( - ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, unsigned ClientLoadCapabilities, - bool AllowCompatibleConfigurationMismatch, ASTReaderListener *Listener, - bool ValidateDiagnosticOptions) { + ModuleFile *F, llvm::StringRef StreamData, StringRef Filename, + unsigned ClientLoadCapabilities, bool AllowCompatibleConfigurationMismatch, + ASTReaderListener *Listener, bool ValidateDiagnosticOptions) { // Initialize a stream. 
BitstreamCursor Stream(StreamData); @@ -5376,34 +5398,37 @@ namespace { ExistingModuleCachePath(ExistingModuleCachePath), FileMgr(FileMgr), StrictOptionMatches(StrictOptionMatches) {} - bool ReadLanguageOptions(const LangOptions &LangOpts, StringRef ModuleFilename, bool Complain, + bool ReadLanguageOptions(const LangOptions &LangOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkLanguageOptions(ExistingLangOpts, LangOpts, ModuleFilename, nullptr, - AllowCompatibleDifferences); + return checkLanguageOptions(ExistingLangOpts, LangOpts, ModuleFilename, + nullptr, AllowCompatibleDifferences); } - bool ReadTargetOptions(const TargetOptions &TargetOpts, StringRef ModuleFilename, bool Complain, + bool ReadTargetOptions(const TargetOptions &TargetOpts, + StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - return checkTargetOptions(ExistingTargetOpts, TargetOpts, ModuleFilename, nullptr, - AllowCompatibleDifferences); + return checkTargetOptions(ExistingTargetOpts, TargetOpts, ModuleFilename, + nullptr, AllowCompatibleDifferences); } bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, StringRef ModuleFilename, StringRef SpecificModuleCachePath, bool Complain) override { - return checkModuleCachePath( - FileMgr.getVirtualFileSystem(), SpecificModuleCachePath, - ExistingModuleCachePath, ModuleFilename, nullptr, ExistingLangOpts, ExistingPPOpts); + return checkModuleCachePath(FileMgr.getVirtualFileSystem(), + SpecificModuleCachePath, + ExistingModuleCachePath, ModuleFilename, + nullptr, ExistingLangOpts, ExistingPPOpts); } bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, - StringRef ModuleFilename, - bool ReadMacros, bool Complain, + StringRef ModuleFilename, bool ReadMacros, + bool Complain, std::string &SuggestedPredefines) override { return checkPreprocessorOptions( - PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, /*Diags=*/nullptr, FileMgr, - 
SuggestedPredefines, ExistingLangOpts, + PPOpts, ExistingPPOpts, ModuleFilename, ReadMacros, /*Diags=*/nullptr, + FileMgr, SuggestedPredefines, ExistingLangOpts, StrictOptionMatches ? OptionValidateStrictMatches : OptionValidateContradictions); } @@ -6079,7 +6104,8 @@ bool ASTReader::ParseLanguageOptions(const RecordData &Record, AllowCompatibleDifferences); } -bool ASTReader::ParseTargetOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, +bool ASTReader::ParseTargetOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener, bool AllowCompatibleDifferences) { unsigned Idx = 0; @@ -6099,7 +6125,8 @@ bool ASTReader::ParseTargetOptions(const RecordData &Record, StringRef ModuleFil AllowCompatibleDifferences); } -bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, StringRef ModuleFilename, bool Complain, +bool ASTReader::ParseDiagnosticOptions(const RecordData &Record, + StringRef ModuleFilename, bool Complain, ASTReaderListener &Listener) { IntrusiveRefCntPtr DiagOpts(new DiagnosticOptions); unsigned Idx = 0; @@ -6125,7 +6152,8 @@ bool ASTReader::ParseFileSystemOptions(const RecordData &Record, bool Complain, } bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, - StringRef ModuleFilename, bool Complain, + StringRef ModuleFilename, + bool Complain, ASTReaderListener &Listener) { HeaderSearchOptions HSOpts; unsigned Idx = 0; @@ -6144,8 +6172,8 @@ bool ASTReader::ParseHeaderSearchOptions(const RecordData &Record, HSOpts.UseLibcxx = Record[Idx++]; std::string SpecificModuleCachePath = ReadString(Record, Idx); - return Listener.ReadHeaderSearchOptions(HSOpts, ModuleFilename, SpecificModuleCachePath, - Complain); + return Listener.ReadHeaderSearchOptions(HSOpts, ModuleFilename, + SpecificModuleCachePath, Complain); } bool ASTReader::ParseHeaderSearchPaths(const RecordData &Record, bool Complain, @@ -6181,7 +6209,8 @@ bool ASTReader::ParseHeaderSearchPaths(const RecordData 
&Record, bool Complain, } bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, - StringRef ModuleFilename, bool Complain, + StringRef ModuleFilename, + bool Complain, ASTReaderListener &Listener, std::string &SuggestedPredefines) { PreprocessorOptions PPOpts; @@ -6213,8 +6242,8 @@ bool ASTReader::ParsePreprocessorOptions(const RecordData &Record, PPOpts.ObjCXXARCStandardLibrary = static_cast(Record[Idx++]); SuggestedPredefines.clear(); - return Listener.ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, Complain, - SuggestedPredefines); + return Listener.ReadPreprocessorOptions(PPOpts, ModuleFilename, ReadMacros, + Complain, SuggestedPredefines); } std::pair From b24737d3ac522fc4e36d3be5b52ffca9d53e30b5 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Jian Date: Thu, 8 Aug 2024 23:04:39 +0800 Subject: [PATCH 020/112] [NFC][RISCV] use OS.indent() to replace spaces (#102429) --- llvm/utils/TableGen/RISCVTargetDefEmitter.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp index 910488a14b9858c..ae5ce32d617b476 100644 --- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp +++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp @@ -36,9 +36,9 @@ static void printExtensionTable(raw_ostream &OS, if (R->getValueAsBit("Experimental") != Experimental) continue; - OS << " {\"" << getExtensionName(R) << "\", {" - << R->getValueAsInt("MajorVersion") << ", " - << R->getValueAsInt("MinorVersion") << "}},\n"; + OS.indent(4) << "{\"" << getExtensionName(R) << "\", {" + << R->getValueAsInt("MajorVersion") << ", " + << R->getValueAsInt("MinorVersion") << "}},\n"; } OS << "};\n\n"; @@ -77,8 +77,8 @@ static void emitRISCVExtensions(RecordKeeper &Records, raw_ostream &OS) { if (!ImpliedExt->isSubClassOf("RISCVExtension")) continue; - OS << " { {\"" << Name << "\"}, \"" << getExtensionName(ImpliedExt) - << "\"},\n"; + OS.indent(4) << "{ {\"" << Name 
<< "\"}, \"" + << getExtensionName(ImpliedExt) << "\"},\n"; } } @@ -236,10 +236,10 @@ static void emitRISCVExtensionBitmask(RecordKeeper &RK, raw_ostream &OS) { "duplicated bitmask"); #endif - OS << " {" - << "\"" << ExtName << "\"" - << ", " << GroupIDVal << ", " << BitPosVal << "ULL" - << "},\n"; + OS.indent(4) << "{" + << "\"" << ExtName << "\"" + << ", " << GroupIDVal << ", " << BitPosVal << "ULL" + << "},\n"; } OS << "};\n"; OS << "#endif\n"; From b2e69f52bb5da067109b9a7d1f73d0dd1a6bb5ad Mon Sep 17 00:00:00 2001 From: hev Date: Thu, 8 Aug 2024 23:05:38 +0800 Subject: [PATCH 021/112] [LoongArch] Add machine function pass to merge base + offset (#101139) This commit references RISC-V to add a machine function pass to merge the base address and offset. --- llvm/lib/Target/LoongArch/CMakeLists.txt | 1 + llvm/lib/Target/LoongArch/LoongArch.h | 2 + .../Target/LoongArch/LoongArchAsmPrinter.cpp | 6 + .../LoongArch/LoongArchMergeBaseOffset.cpp | 636 ++++ .../LoongArch/LoongArchTargetMachine.cpp | 3 + llvm/test/CodeGen/LoongArch/block-address.ll | 10 +- .../CodeGen/LoongArch/calling-conv-lp64d.ll | 21 +- llvm/test/CodeGen/LoongArch/double-imm.ll | 6 +- llvm/test/CodeGen/LoongArch/float-imm-vldi.ll | 3060 ++++++----------- llvm/test/CodeGen/LoongArch/float-imm.ll | 6 +- llvm/test/CodeGen/LoongArch/ghc-cc.ll | 51 +- llvm/test/CodeGen/LoongArch/global-address.ll | 18 +- .../LoongArch/global-variable-code-model.ll | 6 +- .../LoongArch/inline-asm-constraint-f.ll | 6 +- .../LoongArch/inline-asm-constraint-m.ll | 10 +- .../LoongArch/ir-instruction/atomicrmw-fp.ll | 45 +- .../ir-instruction/double-convert.ll | 12 +- .../LoongArch/ir-instruction/float-convert.ll | 18 +- .../LoongArch/ir-instruction/load-store.ll | 28 +- .../LoongArch/machinelicm-address-pseudos.ll | 23 +- .../CodeGen/LoongArch/merge-base-offset.ll | 435 +-- llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 1 + .../LoongArch/psabi-restricted-scheduling.ll | 12 +- llvm/test/CodeGen/LoongArch/vector-fp-imm.ll | 126 +- 
...arch_generated_funcs.ll.generated.expected | 3 +- ...ch_generated_funcs.ll.nogenerated.expected | 3 +- .../llvm/lib/Target/LoongArch/BUILD.gn | 1 + 27 files changed, 1974 insertions(+), 2575 deletions(-) create mode 100644 llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp diff --git a/llvm/lib/Target/LoongArch/CMakeLists.txt b/llvm/lib/Target/LoongArch/CMakeLists.txt index cadc335a621f2e5..0f674b1b0fa9e22 100644 --- a/llvm/lib/Target/LoongArch/CMakeLists.txt +++ b/llvm/lib/Target/LoongArch/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_target(LoongArchCodeGen LoongArchISelDAGToDAG.cpp LoongArchISelLowering.cpp LoongArchMCInstLower.cpp + LoongArchMergeBaseOffset.cpp LoongArchOptWInstrs.cpp LoongArchRegisterInfo.cpp LoongArchSubtarget.cpp diff --git a/llvm/lib/Target/LoongArch/LoongArch.h b/llvm/lib/Target/LoongArch/LoongArch.h index adfb844ee31b649..db6052373888093 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.h +++ b/llvm/lib/Target/LoongArch/LoongArch.h @@ -36,12 +36,14 @@ bool lowerLoongArchMachineOperandToMCOperand(const MachineOperand &MO, FunctionPass *createLoongArchDeadRegisterDefinitionsPass(); FunctionPass *createLoongArchExpandAtomicPseudoPass(); FunctionPass *createLoongArchISelDag(LoongArchTargetMachine &TM); +FunctionPass *createLoongArchMergeBaseOffsetOptPass(); FunctionPass *createLoongArchOptWInstrsPass(); FunctionPass *createLoongArchPreRAExpandPseudoPass(); FunctionPass *createLoongArchExpandPseudoPass(); void initializeLoongArchDAGToDAGISelLegacyPass(PassRegistry &); void initializeLoongArchDeadRegisterDefinitionsPass(PassRegistry &); void initializeLoongArchExpandAtomicPseudoPass(PassRegistry &); +void initializeLoongArchMergeBaseOffsetOptPass(PassRegistry &); void initializeLoongArchOptWInstrsPass(PassRegistry &); void initializeLoongArchPreRAExpandPseudoPass(PassRegistry &); void initializeLoongArchExpandPseudoPass(PassRegistry &); diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp 
b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index f478870217ec606..8bb9497a847fa79 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -130,10 +130,16 @@ bool LoongArchAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, OS << "$" << LoongArchInstPrinter::getRegisterName(BaseMO.getReg()); // Print the offset operand. const MachineOperand &OffsetMO = MI->getOperand(OpNo + 1); + MCOperand MCO; + if (!lowerOperand(OffsetMO, MCO)) + return true; if (OffsetMO.isReg()) OS << ", $" << LoongArchInstPrinter::getRegisterName(OffsetMO.getReg()); else if (OffsetMO.isImm()) OS << ", " << OffsetMO.getImm(); + else if (OffsetMO.isGlobal() || OffsetMO.isBlockAddress() || + OffsetMO.isMCSymbol()) + OS << ", " << *MCO.getExpr(); else return true; diff --git a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp new file mode 100644 index 000000000000000..ae50b7a6f923e3a --- /dev/null +++ b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp @@ -0,0 +1,636 @@ +//===---- LoongArchMergeBaseOffset.cpp - Optimise address calculations ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Merge the offset of address calculation into the offset field +// of instructions in a global address lowering sequence. 
+// +//===----------------------------------------------------------------------===// + +#include "LoongArch.h" +#include "LoongArchTargetMachine.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetOptions.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "loongarch-merge-base-offset" +#define LoongArch_MERGE_BASE_OFFSET_NAME "LoongArch Merge Base Offset" + +namespace { + +class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass { + const LoongArchSubtarget *ST = nullptr; + MachineRegisterInfo *MRI; + +public: + static char ID; + bool runOnMachineFunction(MachineFunction &Fn) override; + bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + + bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + void foldOffset(MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, + int64_t Offset); + bool foldLargeOffset(MachineInstr &Hi20, MachineInstr &Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last, MachineInstr &TailAdd, + Register GAReg); + + bool foldIntoMemoryOps(MachineInstr &Hi20, MachineInstr &Lo12, + MachineInstr *&Lo20, MachineInstr *&Hi12, + MachineInstr *&Last); + + LoongArchMergeBaseOffsetOpt() : MachineFunctionPass(ID) {} + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::IsSSA); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return LoongArch_MERGE_BASE_OFFSET_NAME; + } +}; +} // end anonymous namespace + +char 
LoongArchMergeBaseOffsetOpt::ID = 0; +INITIALIZE_PASS(LoongArchMergeBaseOffsetOpt, DEBUG_TYPE, + LoongArch_MERGE_BASE_OFFSET_NAME, false, false) + +// Detect either of the patterns: +// +// 1. (small/medium): +// pcalau12i vreg1, %pc_hi20(s) +// addi.d vreg2, vreg1, %pc_lo12(s) +// +// 2. (large): +// pcalau12i vreg1, %pc_hi20(s) +// addi.d vreg2, $zero, %pc_lo12(s) +// lu32i.d vreg3, vreg2, %pc64_lo20(s) +// lu52i.d vreg4, vreg3, %pc64_hi12(s) +// add.d vreg5, vreg4, vreg1 + +// The pattern is only accepted if: +// 1) For small and medium pattern, the first instruction has only one use, +// which is the ADDI. +// 2) For large pattern, the first four instructions each have only one use, +// and the user of the fourth instruction is ADD. +// 3) The address operands have the appropriate type, reflecting the +// lowering of a global address or constant pool using the pattern. +// 4) The offset value in the Global Address or Constant Pool is 0. +bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20, + MachineInstr *&Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + if (Hi20.getOpcode() != LoongArch::PCALAU12I) + return false; + + const MachineOperand &Hi20Op1 = Hi20.getOperand(1); + if (Hi20Op1.getTargetFlags() != LoongArchII::MO_PCREL_HI) + return false; + + auto isGlobalOrCPIOrBlockAddress = [](const MachineOperand &Op) { + return Op.isGlobal() || Op.isCPI() || Op.isBlockAddress(); + }; + + if (!isGlobalOrCPIOrBlockAddress(Hi20Op1) || Hi20Op1.getOffset() != 0) + return false; + + Register HiDestReg = Hi20.getOperand(0).getReg(); + if (!MRI->hasOneUse(HiDestReg)) + return false; + + MachineInstr *UseInst = &*MRI->use_instr_begin(HiDestReg); + if (UseInst->getOpcode() != LoongArch::ADD_D) { + Lo12 = UseInst; + if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) || + (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W)) + return false; + } else { + assert(ST->is64Bit()); + Last = UseInst; + + Register LastOp1Reg 
= Last->getOperand(1).getReg(); + if (!LastOp1Reg.isVirtual()) + return false; + Hi12 = MRI->getVRegDef(LastOp1Reg); + const MachineOperand &Hi12Op2 = Hi12->getOperand(2); + if (Hi12Op2.getTargetFlags() != LoongArchII::MO_PCREL64_HI) + return false; + if (!isGlobalOrCPIOrBlockAddress(Hi12Op2) || Hi12Op2.getOffset() != 0) + return false; + if (!MRI->hasOneUse(Hi12->getOperand(0).getReg())) + return false; + + Lo20 = MRI->getVRegDef(Hi12->getOperand(1).getReg()); + const MachineOperand &Lo20Op2 = Lo20->getOperand(2); + if (Lo20Op2.getTargetFlags() != LoongArchII::MO_PCREL64_LO) + return false; + if (!isGlobalOrCPIOrBlockAddress(Lo20Op2) || Lo20Op2.getOffset() != 0) + return false; + if (!MRI->hasOneUse(Lo20->getOperand(0).getReg())) + return false; + + Lo12 = MRI->getVRegDef(Lo20->getOperand(1).getReg()); + if (!MRI->hasOneUse(Lo12->getOperand(0).getReg())) + return false; + } + + const MachineOperand &Lo12Op2 = Lo12->getOperand(2); + assert(Hi20.getOpcode() == LoongArch::PCALAU12I); + if (Lo12Op2.getTargetFlags() != LoongArchII::MO_PCREL_LO || + !(isGlobalOrCPIOrBlockAddress(Lo12Op2) || Lo12Op2.isMCSymbol()) || + Lo12Op2.getOffset() != 0) + return false; + + if (Hi20Op1.isGlobal()) { + LLVM_DEBUG(dbgs() << " Found lowered global address: " + << *Hi20Op1.getGlobal() << "\n"); + } else if (Hi20Op1.isBlockAddress()) { + LLVM_DEBUG(dbgs() << " Found lowered basic address: " + << *Hi20Op1.getBlockAddress() << "\n"); + } else if (Hi20Op1.isCPI()) { + LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex() + << "\n"); + } + + return true; +} + +// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions. +// Delete the tail instruction and update all the uses to use the +// output from Last. 
+void LoongArchMergeBaseOffsetOpt::foldOffset( + MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &Tail, + int64_t Offset) { + assert(isInt<32>(Offset) && "Unexpected offset"); + // Put the offset back in Hi and the Lo + Hi20.getOperand(1).setOffset(Offset); + Lo12.getOperand(2).setOffset(Offset); + if (Lo20 && Hi12) { + Lo20->getOperand(2).setOffset(Offset); + Hi12->getOperand(2).setOffset(Offset); + } + // Delete the tail instruction. + MachineInstr *Def = Last ? Last : &Lo12; + MRI->constrainRegClass(Def->getOperand(0).getReg(), + MRI->getRegClass(Tail.getOperand(0).getReg())); + MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg()); + Tail.eraseFromParent(); + LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n" + << " " << Hi20 << " " << Lo12;); + if (Lo20 && Hi12) { + LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;); + } +} + +// Detect patterns for large offsets that are passed into an ADD instruction. +// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12 +// instructions and deletes TailAdd and the instructions that produced the +// offset. 
+// +// Base address lowering is of the form: +// Hi20: pcalau12i vreg1, %pc_hi20(s) +// Lo12: addi.d vreg2, vreg1, %pc_lo12(s) +// / \ +// / \ +// / \ +// / The large offset can be of two forms: \ +// 1) Offset that has non zero bits in lower 2) Offset that has non zero +// 12 bits and upper 20 bits bits in upper 20 bits only +// OffsetHi: lu12i.w vreg3, 4 +// OffsetLo: ori voff, vreg3, 188 OffsetHi: lu12i.w voff, 128 +// \ / +// \ / +// \ / +// \ / +// TailAdd: add.d vreg4, vreg2, voff +bool LoongArchMergeBaseOffsetOpt::foldLargeOffset( + MachineInstr &Hi20, MachineInstr &Lo12, MachineInstr *&Lo20, + MachineInstr *&Hi12, MachineInstr *&Last, MachineInstr &TailAdd, + Register GAReg) { + assert((TailAdd.getOpcode() == LoongArch::ADD_W || + TailAdd.getOpcode() == LoongArch::ADD_D) && + "Expected ADD instruction!"); + Register Rs = TailAdd.getOperand(1).getReg(); + Register Rt = TailAdd.getOperand(2).getReg(); + Register Reg = Rs == GAReg ? Rt : Rs; + + // Can't fold if the register has more than one use. + if (!Reg.isVirtual() || !MRI->hasOneUse(Reg)) + return false; + // This can point to an ORI or a LU12I.W: + MachineInstr &OffsetTail = *MRI->getVRegDef(Reg); + if (OffsetTail.getOpcode() == LoongArch::ORI) { + // The offset value has non zero bits in both %hi and %lo parts. + // Detect an ORI that feeds from a LU12I.W instruction. + MachineOperand &OriImmOp = OffsetTail.getOperand(2); + if (OriImmOp.getTargetFlags() != LoongArchII::MO_None) + return false; + Register OriReg = OffsetTail.getOperand(1).getReg(); + int64_t OffLo = OriImmOp.getImm(); + + // Handle rs1 of ORI is R0. 
+ if (OriReg == LoongArch::R0) { + LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, OffLo); + OffsetTail.eraseFromParent(); + return true; + } + + MachineInstr &OffsetLu12i = *MRI->getVRegDef(OriReg); + MachineOperand &Lu12iImmOp = OffsetLu12i.getOperand(1); + if (OffsetLu12i.getOpcode() != LoongArch::LU12I_W || + Lu12iImmOp.getTargetFlags() != LoongArchII::MO_None || + !MRI->hasOneUse(OffsetLu12i.getOperand(0).getReg())) + return false; + int64_t Offset = SignExtend64<32>(Lu12iImmOp.getImm() << 12); + Offset += OffLo; + // LU12I.W+ORI sign extends the result. + Offset = SignExtend64<32>(Offset); + LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail + << " " << OffsetLu12i); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); + OffsetTail.eraseFromParent(); + OffsetLu12i.eraseFromParent(); + return true; + } else if (OffsetTail.getOpcode() == LoongArch::LU12I_W) { + // The offset value has all zero bits in the lower 12 bits. Only LU12I.W + // exists. + LLVM_DEBUG(dbgs() << " Offset Instr: " << OffsetTail); + int64_t Offset = SignExtend64<32>(OffsetTail.getOperand(1).getImm() << 12); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailAdd, Offset); + OffsetTail.eraseFromParent(); + return true; + } + return false; +} + +bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20, + MachineInstr &Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + Register DestReg = + Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); + + // Look for arithmetic instructions we can get an offset from. + // We might be able to remove the arithmetic instructions by folding the + // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I). + if (!MRI->hasOneUse(DestReg)) + return false; + + // DestReg has only one use. 
+ MachineInstr &Tail = *MRI->use_instr_begin(DestReg); + switch (Tail.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Don't know how to get offset from this instr:" + << Tail); + break; + case LoongArch::ADDI_W: + if (ST->is64Bit()) + return false; + [[fallthrough]]; + case LoongArch::ADDI_D: + case LoongArch::ADDU16I_D: { + // Offset is simply an immediate operand. + int64_t Offset = Tail.getOperand(2).getImm(); + if (Tail.getOpcode() == LoongArch::ADDU16I_D) + Offset = SignExtend64<32>(Offset << 16); + + // We might have two ADDIs in a row. + Register TailDestReg = Tail.getOperand(0).getReg(); + if (MRI->hasOneUse(TailDestReg)) { + MachineInstr &TailTail = *MRI->use_instr_begin(TailDestReg); + if (ST->is64Bit() && TailTail.getOpcode() == LoongArch::ADDI_W) + return false; + if (TailTail.getOpcode() == LoongArch::ADDI_W || + TailTail.getOpcode() == LoongArch::ADDI_D) { + Offset += TailTail.getOperand(2).getImm(); + LLVM_DEBUG(dbgs() << " Offset Instrs: " << Tail << TailTail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, TailTail, Offset); + Tail.eraseFromParent(); + return true; + } + } + + LLVM_DEBUG(dbgs() << " Offset Instr: " << Tail); + foldOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, Offset); + return true; + } + case LoongArch::ADD_W: + if (ST->is64Bit()) + return false; + [[fallthrough]]; + case LoongArch::ADD_D: + // The offset is too large to fit in the immediate field of ADDI. + // This can be in two forms: + // 1) LU12I.W hi_offset followed by: + // ORI lo_offset + // This happens in case the offset has non zero bits in + // both hi 20 and lo 12 bits. + // 2) LU12I.W (offset20) + // This happens in case the lower 12 bits of the offset are zeros. + return foldLargeOffset(Hi20, Lo12, Lo20, Hi12, Last, Tail, DestReg); + break; + } + + return false; +} + +// Memory access opcode mapping for transforms. +static unsigned getNewOpc(unsigned Op, bool isLarge) { + switch (Op) { + case LoongArch::LD_B: + return isLarge ? 
LoongArch::LDX_B : LoongArch::LD_B; + case LoongArch::LD_H: + return isLarge ? LoongArch::LDX_H : LoongArch::LD_H; + case LoongArch::LD_W: + case LoongArch::LDPTR_W: + return isLarge ? LoongArch::LDX_W : LoongArch::LD_W; + case LoongArch::LD_D: + case LoongArch::LDPTR_D: + return isLarge ? LoongArch::LDX_D : LoongArch::LD_D; + case LoongArch::LD_BU: + return isLarge ? LoongArch::LDX_BU : LoongArch::LD_BU; + case LoongArch::LD_HU: + return isLarge ? LoongArch::LDX_HU : LoongArch::LD_HU; + case LoongArch::LD_WU: + return isLarge ? LoongArch::LDX_WU : LoongArch::LD_WU; + case LoongArch::FLD_S: + return isLarge ? LoongArch::FLDX_S : LoongArch::FLD_S; + case LoongArch::FLD_D: + return isLarge ? LoongArch::FLDX_D : LoongArch::FLD_D; + case LoongArch::ST_B: + return isLarge ? LoongArch::STX_B : LoongArch::ST_B; + case LoongArch::ST_H: + return isLarge ? LoongArch::STX_H : LoongArch::ST_H; + case LoongArch::ST_W: + case LoongArch::STPTR_W: + return isLarge ? LoongArch::STX_W : LoongArch::ST_W; + case LoongArch::ST_D: + case LoongArch::STPTR_D: + return isLarge ? LoongArch::STX_D : LoongArch::ST_D; + case LoongArch::FST_S: + return isLarge ? LoongArch::FSTX_S : LoongArch::FST_S; + case LoongArch::FST_D: + return isLarge ? LoongArch::FSTX_D : LoongArch::FST_D; + default: + llvm_unreachable("Unexpected opcode for replacement"); + } +} + +bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20, + MachineInstr &Lo12, + MachineInstr *&Lo20, + MachineInstr *&Hi12, + MachineInstr *&Last) { + Register DestReg = + Last ? Last->getOperand(0).getReg() : Lo12.getOperand(0).getReg(); + + // If all the uses are memory ops with the same offset, we can transform: + // + // 1. (small/medium): + // pcalau12i vreg1, %pc_hi20(s) + // addi.d vreg2, vreg1, %pc_lo12(s) + // ld.w vreg3, 8(vreg2) + // + // => + // + // pcalau12i vreg1, %pc_hi20(s+8) + // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1) + // + // 2. 
(large): + //   pcalau12i vreg1, %pc_hi20(s) + //   addi.d vreg2, $zero, %pc_lo12(s) + //   lu32i.d vreg3, vreg2, %pc64_lo20(s) + //   lu52i.d vreg4, vreg3, %pc64_hi12(s) + //   add.d vreg5, vreg4, vreg1 + //   ld.w vreg6, 8(vreg5) + // + // => + // + //   pcalau12i vreg1, %pc_hi20(s+8) + //   addi.d vreg2, $zero, %pc_lo12(s+8) + //   lu32i.d vreg3, vreg2, %pc64_lo20(s+8) + //   lu52i.d vreg4, vreg3, %pc64_hi12(s+8) + //   ldx.w vreg6, vreg4, vreg1 + + std::optional<int64_t> CommonOffset; + DenseMap<const MachineInstr *, SmallVector<unsigned>> + InlineAsmMemoryOpIndexesMap; + for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) { + switch (UseMI.getOpcode()) { + default: + LLVM_DEBUG(dbgs() << "Not a load or store instruction: " << UseMI); + return false; + case LoongArch::LD_B: + case LoongArch::LD_H: + case LoongArch::LD_W: + case LoongArch::LD_D: + case LoongArch::LD_BU: + case LoongArch::LD_HU: + case LoongArch::LD_WU: + case LoongArch::LDPTR_W: + case LoongArch::LDPTR_D: + case LoongArch::FLD_S: + case LoongArch::FLD_D: + case LoongArch::ST_B: + case LoongArch::ST_H: + case LoongArch::ST_W: + case LoongArch::ST_D: + case LoongArch::STPTR_W: + case LoongArch::STPTR_D: + case LoongArch::FST_S: + case LoongArch::FST_D: { + if (UseMI.getOperand(1).isFI()) + return false; + // Register defined by Lo should not be the value register. + if (DestReg == UseMI.getOperand(0).getReg()) + return false; + assert(DestReg == UseMI.getOperand(1).getReg() && + "Expected base address use"); + // All load/store instructions must use the same offset. + int64_t Offset = UseMI.getOperand(2).getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + break; + } + case LoongArch::INLINEASM: + case LoongArch::INLINEASM_BR: { + // We can't do this for large pattern. 
+ if (Last) + return false; + SmallVector<unsigned> InlineAsmMemoryOpIndexes; + unsigned NumOps = 0; + for (unsigned I = InlineAsm::MIOp_FirstOperand; + I < UseMI.getNumOperands(); I += 1 + NumOps) { + const MachineOperand &FlagsMO = UseMI.getOperand(I); + // Should be an imm. + if (!FlagsMO.isImm()) + continue; + + const InlineAsm::Flag Flags(FlagsMO.getImm()); + NumOps = Flags.getNumOperandRegisters(); + + // Memory constraints have two operands. + if (NumOps != 2 || !Flags.isMemKind()) { + // If the register is used by something other than a memory constraint, + // we should not fold. + for (unsigned J = 0; J < NumOps; ++J) { + const MachineOperand &MO = UseMI.getOperand(I + 1 + J); + if (MO.isReg() && MO.getReg() == DestReg) + return false; + } + continue; + } + + // We can only do this for constraint m. + if (Flags.getMemoryConstraintID() != InlineAsm::ConstraintCode::m) + return false; + + const MachineOperand &AddrMO = UseMI.getOperand(I + 1); + if (!AddrMO.isReg() || AddrMO.getReg() != DestReg) + continue; + + const MachineOperand &OffsetMO = UseMI.getOperand(I + 2); + if (!OffsetMO.isImm()) + continue; + + // All inline asm memory operands must use the same offset. + int64_t Offset = OffsetMO.getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + InlineAsmMemoryOpIndexes.push_back(I + 1); + } + InlineAsmMemoryOpIndexesMap.insert( + std::make_pair(&UseMI, InlineAsmMemoryOpIndexes)); + break; + } + } + } + + // We found a common offset. + // Update the offsets in global address lowering. + // We may have already folded some arithmetic so we need to add to any + // existing offset. + int64_t NewOffset = Hi20.getOperand(1).getOffset() + *CommonOffset; + // LA32 ignores the upper 32 bits. + if (!ST->is64Bit()) + NewOffset = SignExtend64<32>(NewOffset); + // We can only fold simm32 offsets. 
+ if (!isInt<32>(NewOffset)) + return false; + + Hi20.getOperand(1).setOffset(NewOffset); + MachineOperand &ImmOp = Lo12.getOperand(2); + ImmOp.setOffset(NewOffset); + if (Lo20 && Hi12) { + Lo20->getOperand(2).setOffset(NewOffset); + Hi12->getOperand(2).setOffset(NewOffset); + } + + // Update the immediate in the load/store instructions to add the offset. + const LoongArchInstrInfo &TII = *ST->getInstrInfo(); + for (MachineInstr &UseMI : + llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { + if (UseMI.getOpcode() == LoongArch::INLINEASM || + UseMI.getOpcode() == LoongArch::INLINEASM_BR) { + auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI]; + for (unsigned I : InlineAsmMemoryOpIndexes) { + MachineOperand &MO = UseMI.getOperand(I + 1); + switch (ImmOp.getType()) { + case MachineOperand::MO_GlobalAddress: + MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + case MachineOperand::MO_MCSymbol: + MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), ImmOp.getTargetFlags()); + MO.setOffset(ImmOp.getOffset()); + break; + case MachineOperand::MO_BlockAddress: + MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + default: + report_fatal_error("unsupported machine operand type"); + break; + } + } + } else { + UseMI.setDesc(TII.get(getNewOpc(UseMI.getOpcode(), Last))); + if (Last) { + UseMI.removeOperand(2); + UseMI.removeOperand(1); + UseMI.addOperand(Last->getOperand(1)); + UseMI.addOperand(Last->getOperand(2)); + UseMI.getOperand(1).setIsKill(false); + UseMI.getOperand(2).setIsKill(false); + } else { + UseMI.removeOperand(2); + UseMI.addOperand(ImmOp); + } + } + } + + if (Last) { + Last->eraseFromParent(); + return true; + } + + MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg()); + Lo12.eraseFromParent(); + return true; +} + +bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) { + if (skipFunction(Fn.getFunction())) + return 
false; + + ST = &Fn.getSubtarget<LoongArchSubtarget>(); + + bool MadeChange = false; + MRI = &Fn.getRegInfo(); + for (MachineBasicBlock &MBB : Fn) { + LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n"); + for (MachineInstr &Hi20 : MBB) { + MachineInstr *Lo12 = nullptr; + MachineInstr *Lo20 = nullptr; + MachineInstr *Hi12 = nullptr; + MachineInstr *Last = nullptr; + if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last)) + continue; + MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last); + MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last); + } + } + + return MadeChange; +} + +/// Returns an instance of the Merge Base Offset Optimization pass. +FunctionPass *llvm::createLoongArchMergeBaseOffsetOptPass() { + return new LoongArchMergeBaseOffsetOpt(); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index e83fc08696aea53..4401aadfe78485a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -35,6 +35,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeLoongArchTarget() { RegisterTargetMachine<LoongArchTargetMachine> Y(getTheLoongArch64Target()); auto *PR = PassRegistry::getPassRegistry(); initializeLoongArchDeadRegisterDefinitionsPass(*PR); + initializeLoongArchMergeBaseOffsetOptPass(*PR); initializeLoongArchOptWInstrsPass(*PR); initializeLoongArchPreRAExpandPseudoPass(*PR); initializeLoongArchDAGToDAGISelLegacyPass(*PR); @@ -216,6 +217,8 @@ void LoongArchPassConfig::addMachineSSAOptimization() { void LoongArchPassConfig::addPreRegAlloc() { addPass(createLoongArchPreRAExpandPseudoPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createLoongArchMergeBaseOffsetOptPass()); } bool LoongArchPassConfig::addRegAssignAndRewriteFast() { diff --git a/llvm/test/CodeGen/LoongArch/block-address.ll b/llvm/test/CodeGen/LoongArch/block-address.ll index eaba81f3563d7f3..114cbb73a512592 --- 
a/llvm/test/CodeGen/LoongArch/block-address.ll +++ b/llvm/test/CodeGen/LoongArch/block-address.ll @@ -8,11 +8,10 @@ define void @test_blockaddress() nounwind { ; LA32-LABEL: test_blockaddress: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(addr) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(addr) ; LA32-NEXT: pcalau12i $a1, %pc_hi20(.Ltmp0) ; LA32-NEXT: addi.w $a1, $a1, %pc_lo12(.Ltmp0) -; LA32-NEXT: st.w $a1, $a0, 0 -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(addr) +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(addr) ; LA32-NEXT: jr $a0 ; LA32-NEXT: .Ltmp0: # Block address taken ; LA32-NEXT: .LBB0_1: # %block @@ -21,11 +20,10 @@ define void @test_blockaddress() nounwind { ; LA64-LABEL: test_blockaddress: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(addr) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(addr) ; LA64-NEXT: pcalau12i $a1, %pc_hi20(.Ltmp0) ; LA64-NEXT: addi.d $a1, $a1, %pc_lo12(.Ltmp0) -; LA64-NEXT: st.d $a1, $a0, 0 -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(addr) +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(addr) ; LA64-NEXT: jr $a0 ; LA64-NEXT: .Ltmp0: # Block address taken ; LA64-NEXT: .LBB0_1: # %block diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll index cc6ba057019c652..34fbec03c535b07 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll @@ -64,26 +64,19 @@ define i64 @caller_double_in_gpr_exhausted_fprs() nounwind { ; CHECK-NEXT: addi.d $sp, $sp, -16 ; CHECK-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) -; CHECK-NEXT: fld.d $fa1, $a0, 0 +; CHECK-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_0) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_1) -; CHECK-NEXT: fld.d $fa2, $a0, 0 +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI3_1) ; CHECK-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI3_2) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_2) -; CHECK-NEXT: fld.d $fa3, $a0, 0 +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI3_2) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_3) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_3) -; CHECK-NEXT: fld.d $fa4, $a0, 0 +; CHECK-NEXT: fld.d $fa4, $a0, %pc_lo12(.LCPI3_3) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_4) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_4) -; CHECK-NEXT: fld.d $fa5, $a0, 0 +; CHECK-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI3_4) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_5) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_5) -; CHECK-NEXT: fld.d $fa6, $a0, 0 +; CHECK-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI3_5) ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_6) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_6) -; CHECK-NEXT: fld.d $fa7, $a0, 0 +; CHECK-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI3_6) ; CHECK-NEXT: addi.d $a0, $zero, 1 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: ffint.d.l $fa0, $fa0 diff --git a/llvm/test/CodeGen/LoongArch/double-imm.ll b/llvm/test/CodeGen/LoongArch/double-imm.ll index 3e89db3ec5c8ccd..8d50b27907d72b4 100644 --- a/llvm/test/CodeGen/LoongArch/double-imm.ll +++ b/llvm/test/CodeGen/LoongArch/double-imm.ll @@ -36,15 +36,13 @@ define double @f64_constant_pi() nounwind { ; LA32-LABEL: f64_constant_pi: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA32-NEXT: ret ; ; LA64-LABEL: f64_constant_pi: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA64-NEXT: ret ret double 3.1415926535897931159979634685441851615905761718750 } diff --git a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll index 
0c6856bafd23d76..f228efb6b658824 100644 --- a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll +++ b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll @@ -6,21 +6,17 @@ define dso_local { float, double } @test1() { ; LA32-LABEL: test1: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI0_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI0_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI0_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI0_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test1: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI0_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI0_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI0_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.0000000000, double 2.0000000000 } @@ -30,21 +26,17 @@ define dso_local { float, double } @test2() { ; LA32-LABEL: test2: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI1_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI1_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI1_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test2: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI1_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d 
$fa1, $a0, %pc_lo12(.LCPI1_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.1250000000, double 2.1250000000 } @@ -54,21 +46,17 @@ define dso_local { float, double } @test3() { ; LA32-LABEL: test3: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI2_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test3: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI2_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.2500000000, double 2.2500000000 } @@ -78,21 +66,17 @@ define dso_local { float, double } @test4() { ; LA32-LABEL: test4: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI3_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI3_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test4: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI3_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI3_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.3750000000, double 2.3750000000 } @@ -102,21 +86,17 @@ define dso_local { float, double } @test5() { ; LA32-LABEL: test5: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI4_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI4_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test5: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI4_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI4_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI4_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.5000000000, double 2.5000000000 } @@ -126,21 +106,17 @@ define dso_local { float, double } @test6() { ; LA32-LABEL: test6: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI5_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test6: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI5_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_1) 
; LA64-NEXT: ret entry: ret { float, double } { float 2.6250000000, double 2.6250000000 } @@ -150,21 +126,17 @@ define dso_local { float, double } @test7() { ; LA32-LABEL: test7: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI6_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI6_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test7: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI6_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI6_1) ; LA64-NEXT: ret entry: ret { float, double } { float 2.7500000000, double 2.7500000000 } @@ -174,21 +146,17 @@ define dso_local { float, double } @test8() { ; LA32-LABEL: test8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI7_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI7_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_1) ; LA64-NEXT: ret 
entry: ret { float, double } { float 2.8750000000, double 2.8750000000 } @@ -198,21 +166,17 @@ define dso_local { float, double } @test9() { ; LA32-LABEL: test9: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI8_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI8_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI8_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI8_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test9: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI8_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI8_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI8_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI8_1) ; LA64-NEXT: ret entry: ret { float, double } { float 3.0000000000, double 3.0000000000 } @@ -222,21 +186,17 @@ define dso_local { float, double } @test10() { ; LA32-LABEL: test10: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI9_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI9_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI9_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test10: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI9_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_1) ; LA64-NEXT: ret entry: ret { 
float, double } { float 3.1250000000, double 3.1250000000 } @@ -246,21 +206,17 @@ define dso_local { float, double } @test11() { ; LA32-LABEL: test11: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI10_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI10_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI10_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI10_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test11: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI10_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI10_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI10_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI10_1) ; LA64-NEXT: ret entry: ret { float, double } { float 3.2500000000, double 3.2500000000 } @@ -270,21 +226,17 @@ define dso_local { float, double } @test12() { ; LA32-LABEL: test12: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI11_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI11_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI11_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI11_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test12: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI11_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI11_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI11_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI11_1) ; LA64-NEXT: ret 
entry: ret { float, double } { float 3.3750000000, double 3.3750000000 } @@ -294,21 +246,17 @@ define dso_local { float, double } @test13() { ; LA32-LABEL: test13: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI12_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test13: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI12_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI12_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI12_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_1) ; LA64-NEXT: ret entry: ret { float, double } { float 3.5000000000, double 3.5000000000 } @@ -318,21 +266,17 @@ define dso_local { float, double } @test14() { ; LA32-LABEL: test14: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI13_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI13_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI13_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI13_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test14: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI13_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI13_1) ; 
LA64-NEXT: ret entry: ret { float, double } { float 3.6250000000, double 3.6250000000 } @@ -342,21 +286,17 @@ define dso_local { float, double } @test15() { ; LA32-LABEL: test15: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI14_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test15: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI14_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI14_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI14_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_1) ; LA64-NEXT: ret entry: ret { float, double } { float 3.7500000000, double 3.7500000000 } @@ -366,21 +306,17 @@ define dso_local { float, double } @test16() { ; LA32-LABEL: test16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI15_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI15_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI15_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI15_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI15_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI15_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI15_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI15_1) ; LA64-NEXT: ret entry: ret { float, double } { float 3.8750000000, double 3.8750000000 } @@ -390,21 +326,17 @@ define dso_local { float, double } @test17() { ; LA32-LABEL: test17: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI16_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI16_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI16_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI16_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test17: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI16_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI16_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI16_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI16_1) ; LA64-NEXT: ret entry: ret { float, double } { float 4.0000000000, double 4.0000000000 } @@ -414,21 +346,17 @@ define dso_local { float, double } @test18() { ; LA32-LABEL: test18: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI17_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI17_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI17_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI17_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test18: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI17_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI17_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI17_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: 
fld.d $fa1, $a0, %pc_lo12(.LCPI17_1) ; LA64-NEXT: ret entry: ret { float, double } { float 4.2500000000, double 4.2500000000 } @@ -438,21 +366,17 @@ define dso_local { float, double } @test19() { ; LA32-LABEL: test19: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI18_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI18_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI18_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI18_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test19: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI18_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI18_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI18_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI18_1) ; LA64-NEXT: ret entry: ret { float, double } { float 4.5000000000, double 4.5000000000 } @@ -462,21 +386,17 @@ define dso_local { float, double } @test20() { ; LA32-LABEL: test20: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI19_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI19_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI19_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI19_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test20: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI19_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI19_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI19_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 
+; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI19_1) ; LA64-NEXT: ret entry: ret { float, double } { float 4.7500000000, double 4.7500000000 } @@ -486,21 +406,17 @@ define dso_local { float, double } @test21() { ; LA32-LABEL: test21: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI20_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI20_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI20_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI20_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test21: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI20_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI20_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI20_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI20_1) ; LA64-NEXT: ret entry: ret { float, double } { float 5.0000000000, double 5.0000000000 } @@ -510,21 +426,17 @@ define dso_local { float, double } @test22() { ; LA32-LABEL: test22: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI21_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI21_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI21_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI21_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test22: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI21_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_1) -; LA64-NEXT: fld.d 
$fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI21_1) ; LA64-NEXT: ret entry: ret { float, double } { float 5.2500000000, double 5.2500000000 } @@ -534,21 +446,17 @@ define dso_local { float, double } @test23() { ; LA32-LABEL: test23: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI22_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI22_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI22_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI22_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test23: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI22_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI22_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI22_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI22_1) ; LA64-NEXT: ret entry: ret { float, double } { float 5.5000000000, double 5.5000000000 } @@ -558,21 +466,17 @@ define dso_local { float, double } @test24() { ; LA32-LABEL: test24: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI23_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI23_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI23_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI23_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test24: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI23_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI23_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI23_1) -; 
LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI23_1) ; LA64-NEXT: ret entry: ret { float, double } { float 5.7500000000, double 5.7500000000 } @@ -582,21 +486,17 @@ define dso_local { float, double } @test25() { ; LA32-LABEL: test25: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI24_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI24_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI24_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI24_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test25: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI24_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI24_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI24_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI24_1) ; LA64-NEXT: ret entry: ret { float, double } { float 6.0000000000, double 6.0000000000 } @@ -606,21 +506,17 @@ define dso_local { float, double } @test26() { ; LA32-LABEL: test26: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI25_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI25_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI25_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI25_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test26: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI25_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI25_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI25_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI25_1) ; LA64-NEXT: ret entry: ret { float, double } { float 6.2500000000, double 6.2500000000 } @@ -630,21 +526,17 @@ define dso_local { float, double } @test27() { ; LA32-LABEL: test27: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI26_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI26_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI26_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI26_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test27: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI26_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI26_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI26_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI26_1) ; LA64-NEXT: ret entry: ret { float, double } { float 6.5000000000, double 6.5000000000 } @@ -654,21 +546,17 @@ define dso_local { float, double } @test28() { ; LA32-LABEL: test28: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI27_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI27_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI27_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI27_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test28: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI27_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI27_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA64-NEXT: 
addi.d $a0, $a0, %pc_lo12(.LCPI27_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI27_1) ; LA64-NEXT: ret entry: ret { float, double } { float 6.7500000000, double 6.7500000000 } @@ -678,21 +566,17 @@ define dso_local { float, double } @test29() { ; LA32-LABEL: test29: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI28_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI28_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI28_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI28_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test29: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI28_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI28_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI28_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI28_1) ; LA64-NEXT: ret entry: ret { float, double } { float 7.0000000000, double 7.0000000000 } @@ -702,21 +586,17 @@ define dso_local { float, double } @test30() { ; LA32-LABEL: test30: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI29_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI29_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI29_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI29_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test30: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI29_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) 
-; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI29_1) ; LA64-NEXT: ret entry: ret { float, double } { float 7.2500000000, double 7.2500000000 } @@ -726,21 +606,17 @@ define dso_local { float, double } @test31() { ; LA32-LABEL: test31: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI30_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI30_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI30_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI30_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test31: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI30_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI30_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI30_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI30_1) ; LA64-NEXT: ret entry: ret { float, double } { float 7.5000000000, double 7.5000000000 } @@ -750,21 +626,17 @@ define dso_local { float, double } @test32() { ; LA32-LABEL: test32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI31_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI31_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI31_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI31_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI31_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI31_0) ; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI31_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI31_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI31_1) ; LA64-NEXT: ret entry: ret { float, double } { float 7.7500000000, double 7.7500000000 } @@ -774,21 +646,17 @@ define dso_local { float, double } @test33() { ; LA32-LABEL: test33: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI32_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI32_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI32_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI32_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test33: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI32_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI32_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI32_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI32_1) ; LA64-NEXT: ret entry: ret { float, double } { float 8.0000000000, double 8.0000000000 } @@ -798,21 +666,17 @@ define dso_local { float, double } @test34() { ; LA32-LABEL: test34: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI33_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI33_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI33_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI33_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test34: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI33_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI33_0) ; LA64-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI33_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI33_1) ; LA64-NEXT: ret entry: ret { float, double } { float 8.5000000000, double 8.5000000000 } @@ -822,21 +686,17 @@ define dso_local { float, double } @test35() { ; LA32-LABEL: test35: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI34_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI34_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI34_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI34_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test35: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI34_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI34_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI34_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI34_1) ; LA64-NEXT: ret entry: ret { float, double } { float 9.0000000000, double 9.0000000000 } @@ -846,21 +706,17 @@ define dso_local { float, double } @test36() { ; LA32-LABEL: test36: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI35_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI35_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI35_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI35_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test36: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI35_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI35_0) 
; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI35_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI35_1) ; LA64-NEXT: ret entry: ret { float, double } { float 9.5000000000, double 9.5000000000 } @@ -870,21 +726,17 @@ define dso_local { float, double } @test37() { ; LA32-LABEL: test37: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI36_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI36_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI36_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI36_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test37: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI36_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI36_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI36_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI36_1) ; LA64-NEXT: ret entry: ret { float, double } { float 10.0000000000, double 10.0000000000 } @@ -894,21 +746,17 @@ define dso_local { float, double } @test38() { ; LA32-LABEL: test38: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI37_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI37_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI37_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI37_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test38: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI37_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI37_1) ; LA64-NEXT: ret entry: ret { float, double } { float 10.5000000000, double 10.5000000000 } @@ -918,21 +766,17 @@ define dso_local { float, double } @test39() { ; LA32-LABEL: test39: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI38_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI38_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI38_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI38_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test39: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI38_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI38_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI38_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI38_1) ; LA64-NEXT: ret entry: ret { float, double } { float 11.0000000000, double 11.0000000000 } @@ -942,21 +786,17 @@ define dso_local { float, double } @test40() { ; LA32-LABEL: test40: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI39_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI39_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI39_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI39_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test40: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI39_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: 
fld.s $fa0, $a0, %pc_lo12(.LCPI39_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI39_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI39_1) ; LA64-NEXT: ret entry: ret { float, double } { float 11.5000000000, double 11.5000000000 } @@ -966,21 +806,17 @@ define dso_local { float, double } @test41() { ; LA32-LABEL: test41: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI40_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI40_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI40_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI40_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test41: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI40_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI40_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI40_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI40_1) ; LA64-NEXT: ret entry: ret { float, double } { float 12.0000000000, double 12.0000000000 } @@ -990,21 +826,17 @@ define dso_local { float, double } @test42() { ; LA32-LABEL: test42: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI41_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI41_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI41_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI41_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test42: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI41_0) -; LA64-NEXT: fld.s $fa0, $a0, 
0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI41_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI41_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI41_1) ; LA64-NEXT: ret entry: ret { float, double } { float 12.5000000000, double 12.5000000000 } @@ -1014,21 +846,17 @@ define dso_local { float, double } @test43() { ; LA32-LABEL: test43: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI42_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI42_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI42_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI42_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test43: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI42_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI42_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI42_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI42_1) ; LA64-NEXT: ret entry: ret { float, double } { float 13.0000000000, double 13.0000000000 } @@ -1038,21 +866,17 @@ define dso_local { float, double } @test44() { ; LA32-LABEL: test44: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI43_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI43_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI43_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI43_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test44: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI43_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI43_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI43_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI43_1) ; LA64-NEXT: ret entry: ret { float, double } { float 13.5000000000, double 13.5000000000 } @@ -1062,21 +886,17 @@ define dso_local { float, double } @test45() { ; LA32-LABEL: test45: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI44_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI44_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI44_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI44_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test45: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI44_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI44_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI44_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI44_1) ; LA64-NEXT: ret entry: ret { float, double } { float 14.0000000000, double 14.0000000000 } @@ -1086,21 +906,17 @@ define dso_local { float, double } @test46() { ; LA32-LABEL: test46: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI45_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI45_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI45_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI45_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test46: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI45_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI45_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI45_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI45_1) ; LA64-NEXT: ret entry: ret { float, double } { float 14.5000000000, double 14.5000000000 } @@ -1110,21 +926,17 @@ define dso_local { float, double } @test47() { ; LA32-LABEL: test47: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI46_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI46_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI46_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI46_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test47: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI46_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI46_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI46_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI46_1) ; LA64-NEXT: ret entry: ret { float, double } { float 15.0000000000, double 15.0000000000 } @@ -1134,21 +946,17 @@ define dso_local { float, double } @test48() { ; LA32-LABEL: test48: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI47_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI47_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI47_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI47_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test48: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; 
LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI47_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI47_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI47_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI47_1) ; LA64-NEXT: ret entry: ret { float, double } { float 15.5000000000, double 15.5000000000 } @@ -1158,21 +966,17 @@ define dso_local { float, double } @test49() { ; LA32-LABEL: test49: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI48_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI48_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI48_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI48_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test49: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI48_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI48_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI48_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI48_1) ; LA64-NEXT: ret entry: ret { float, double } { float 16.0000000000, double 16.0000000000 } @@ -1182,21 +986,17 @@ define dso_local { float, double } @test50() { ; LA32-LABEL: test50: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI49_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI49_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI49_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI49_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test50: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI49_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI49_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI49_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI49_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI49_1) ; LA64-NEXT: ret entry: ret { float, double } { float 17.0000000000, double 17.0000000000 } @@ -1206,21 +1006,17 @@ define dso_local { float, double } @test51() { ; LA32-LABEL: test51: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI50_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI50_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI50_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI50_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test51: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI50_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI50_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI50_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI50_1) ; LA64-NEXT: ret entry: ret { float, double } { float 18.0000000000, double 18.0000000000 } @@ -1230,21 +1026,17 @@ define dso_local { float, double } @test52() { ; LA32-LABEL: test52: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI51_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI51_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI51_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI51_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test52: ; LA64: # %bb.0: # %entry ; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI51_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI51_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI51_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI51_1) ; LA64-NEXT: ret entry: ret { float, double } { float 19.0000000000, double 19.0000000000 } @@ -1254,21 +1046,17 @@ define dso_local { float, double } @test53() { ; LA32-LABEL: test53: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI52_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI52_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI52_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI52_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test53: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI52_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI52_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI52_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI52_1) ; LA64-NEXT: ret entry: ret { float, double } { float 20.0000000000, double 20.0000000000 } @@ -1278,21 +1066,17 @@ define dso_local { float, double } @test54() { ; LA32-LABEL: test54: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI53_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI53_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI53_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI53_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test54: ; LA64: 
# %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI53_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI53_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI53_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI53_1) ; LA64-NEXT: ret entry: ret { float, double } { float 21.0000000000, double 21.0000000000 } @@ -1302,21 +1086,17 @@ define dso_local { float, double } @test55() { ; LA32-LABEL: test55: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI54_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI54_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI54_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI54_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test55: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI54_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI54_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI54_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI54_1) ; LA64-NEXT: ret entry: ret { float, double } { float 22.0000000000, double 22.0000000000 } @@ -1326,21 +1106,17 @@ define dso_local { float, double } @test56() { ; LA32-LABEL: test56: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI55_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI55_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI55_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI55_1) ; LA32-NEXT: ret ; ; 
LA64-LABEL: test56: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI55_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI55_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI55_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI55_1) ; LA64-NEXT: ret entry: ret { float, double } { float 23.0000000000, double 23.0000000000 } @@ -1350,21 +1126,17 @@ define dso_local { float, double } @test57() { ; LA32-LABEL: test57: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI56_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI56_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI56_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI56_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test57: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI56_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI56_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI56_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI56_1) ; LA64-NEXT: ret entry: ret { float, double } { float 24.0000000000, double 24.0000000000 } @@ -1374,21 +1146,17 @@ define dso_local { float, double } @test58() { ; LA32-LABEL: test58: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI57_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI57_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI57_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI57_1) 
; LA32-NEXT: ret ; ; LA64-LABEL: test58: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI57_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI57_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI57_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI57_1) ; LA64-NEXT: ret entry: ret { float, double } { float 25.0000000000, double 25.0000000000 } @@ -1398,21 +1166,17 @@ define dso_local { float, double } @test59() { ; LA32-LABEL: test59: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI58_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI58_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI58_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI58_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test59: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI58_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI58_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI58_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI58_1) ; LA64-NEXT: ret entry: ret { float, double } { float 26.0000000000, double 26.0000000000 } @@ -1422,21 +1186,17 @@ define dso_local { float, double } @test60() { ; LA32-LABEL: test60: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI59_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI59_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI59_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI59_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test60: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI59_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI59_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI59_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI59_1) ; LA64-NEXT: ret entry: ret { float, double } { float 27.0000000000, double 27.0000000000 } @@ -1446,21 +1206,17 @@ define dso_local { float, double } @test61() { ; LA32-LABEL: test61: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI60_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI60_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI60_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI60_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test61: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI60_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI60_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI60_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI60_1) ; LA64-NEXT: ret entry: ret { float, double } { float 28.0000000000, double 28.0000000000 } @@ -1470,21 +1226,17 @@ define dso_local { float, double } @test62() { ; LA32-LABEL: test62: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI61_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI61_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI61_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; 
LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI61_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test62: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI61_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI61_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI61_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI61_1) ; LA64-NEXT: ret entry: ret { float, double } { float 29.0000000000, double 29.0000000000 } @@ -1494,21 +1246,17 @@ define dso_local { float, double } @test63() { ; LA32-LABEL: test63: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI62_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI62_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI62_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI62_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test63: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI62_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI62_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI62_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI62_1) ; LA64-NEXT: ret entry: ret { float, double } { float 30.0000000000, double 30.0000000000 } @@ -1518,21 +1266,17 @@ define dso_local { float, double } @test64() { ; LA32-LABEL: test64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI63_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI63_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI63_1) -; LA32-NEXT: 
fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI63_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI63_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI63_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI63_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI63_1) ; LA64-NEXT: ret entry: ret { float, double } { float 31.0000000000, double 31.0000000000 } @@ -1542,21 +1286,17 @@ define dso_local { float, double } @test65() { ; LA32-LABEL: test65: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI64_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI64_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI64_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI64_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test65: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI64_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI64_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI64_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI64_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1250000000, double 0.1250000000 } @@ -1566,21 +1306,17 @@ define dso_local { float, double } @test66() { ; LA32-LABEL: test66: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI65_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI65_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI65_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI65_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test66: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI65_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI65_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI65_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI65_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1328125000, double 0.1328125000 } @@ -1590,21 +1326,17 @@ define dso_local { float, double } @test67() { ; LA32-LABEL: test67: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI66_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI66_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI66_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI66_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test67: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI66_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI66_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI66_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI66_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1406250000, double 0.1406250000 } @@ -1614,21 +1346,17 @@ define dso_local { float, double } @test68() { ; LA32-LABEL: test68: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI67_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI67_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI67_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI67_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test68: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI67_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI67_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI67_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI67_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1484375000, double 0.1484375000 } @@ -1638,21 +1366,17 @@ define dso_local { float, double } @test69() { ; LA32-LABEL: test69: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI68_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI68_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI68_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI68_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test69: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI68_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI68_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI68_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI68_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1562500000, double 0.1562500000 } @@ -1662,21 +1386,17 @@ define dso_local { float, double } @test70() { ; LA32-LABEL: test70: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI69_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI69_0) ; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI69_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI69_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI69_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test70: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI69_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI69_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI69_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI69_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1640625000, double 0.1640625000 } @@ -1686,21 +1406,17 @@ define dso_local { float, double } @test71() { ; LA32-LABEL: test71: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI70_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI70_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI70_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI70_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test71: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI70_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI70_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI70_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI70_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1718750000, double 0.1718750000 } @@ -1710,21 +1426,17 @@ define dso_local { float, double } @test72() { ; LA32-LABEL: test72: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI71_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI71_0) ; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI71_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI71_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test72: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI71_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI71_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI71_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI71_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1796875000, double 0.1796875000 } @@ -1734,21 +1446,17 @@ define dso_local { float, double } @test73() { ; LA32-LABEL: test73: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI72_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI72_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI72_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI72_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test73: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI72_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI72_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI72_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI72_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1875000000, double 0.1875000000 } @@ -1758,21 +1466,17 @@ define dso_local { float, double } @test74() { ; LA32-LABEL: test74: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI73_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI73_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI73_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI73_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test74: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI73_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI73_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI73_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI73_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.1953125000, double 0.1953125000 } @@ -1782,21 +1486,17 @@ define dso_local { float, double } @test75() { ; LA32-LABEL: test75: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI74_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI74_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI74_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI74_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test75: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI74_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI74_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI74_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI74_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2031250000, double 0.2031250000 } @@ -1806,21 +1506,17 @@ define dso_local { float, double } @test76() { ; LA32-LABEL: test76: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI75_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: 
fld.s $fa0, $a0, %pc_lo12(.LCPI75_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI75_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI75_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test76: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI75_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI75_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI75_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI75_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2109375000, double 0.2109375000 } @@ -1830,21 +1526,17 @@ define dso_local { float, double } @test77() { ; LA32-LABEL: test77: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI76_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI76_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI76_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI76_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test77: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI76_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI76_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI76_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI76_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2187500000, double 0.2187500000 } @@ -1854,21 +1546,17 @@ define dso_local { float, double } @test78() { ; LA32-LABEL: test78: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI77_0) -; LA32-NEXT: fld.s $fa0, $a0, 
0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI77_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI77_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI77_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test78: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI77_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI77_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI77_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI77_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2265625000, double 0.2265625000 } @@ -1878,21 +1566,17 @@ define dso_local { float, double } @test79() { ; LA32-LABEL: test79: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI78_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI78_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI78_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI78_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test79: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI78_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI78_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI78_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI78_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2343750000, double 0.2343750000 } @@ -1902,21 +1586,17 @@ define dso_local { float, double } @test80() { ; LA32-LABEL: test80: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI79_0) -; LA32-NEXT: 
fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI79_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI79_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI79_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test80: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI79_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI79_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI79_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI79_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2421875000, double 0.2421875000 } @@ -1926,21 +1606,17 @@ define dso_local { float, double } @test81() { ; LA32-LABEL: test81: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI80_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI80_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI80_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI80_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test81: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI80_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI80_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI80_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI80_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2500000000, double 0.2500000000 } @@ -1950,21 +1626,17 @@ define dso_local { float, double } @test82() { ; LA32-LABEL: test82: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI81_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI81_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI81_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI81_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test82: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI81_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI81_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI81_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI81_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2656250000, double 0.2656250000 } @@ -1974,21 +1646,17 @@ define dso_local { float, double } @test83() { ; LA32-LABEL: test83: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI82_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI82_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI82_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI82_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test83: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI82_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI82_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI82_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI82_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2812500000, double 0.2812500000 } @@ -1998,21 +1666,17 @@ define dso_local { float, double } @test84() { ; LA32-LABEL: test84: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI83_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI83_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI83_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI83_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test84: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI83_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI83_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI83_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI83_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.2968750000, double 0.2968750000 } @@ -2022,21 +1686,17 @@ define dso_local { float, double } @test85() { ; LA32-LABEL: test85: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI84_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI84_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI84_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI84_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test85: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI84_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI84_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI84_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI84_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3125000000, double 0.3125000000 } @@ -2046,21 +1706,17 @@ define dso_local { float, double } @test86() { ; LA32-LABEL: test86: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI85_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI85_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI85_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI85_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI85_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test86: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI85_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI85_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI85_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI85_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3281250000, double 0.3281250000 } @@ -2070,21 +1726,17 @@ define dso_local { float, double } @test87() { ; LA32-LABEL: test87: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI86_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI86_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI86_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI86_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test87: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI86_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI86_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI86_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI86_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3437500000, double 0.3437500000 } @@ -2094,21 +1746,17 @@ define dso_local { float, double } @test88() { ; LA32-LABEL: test88: ; LA32: # %bb.0: # %entry ; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI87_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI87_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI87_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI87_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test88: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI87_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI87_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI87_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI87_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3593750000, double 0.3593750000 } @@ -2118,21 +1766,17 @@ define dso_local { float, double } @test89() { ; LA32-LABEL: test89: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI88_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI88_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI88_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI88_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test89: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI88_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI88_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI88_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI88_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3750000000, double 0.3750000000 } @@ -2142,21 +1786,17 @@ define dso_local { float, double } @test90() { ; LA32-LABEL: test90: ; LA32: # 
%bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI89_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI89_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI89_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI89_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test90: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI89_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI89_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI89_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI89_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.3906250000, double 0.3906250000 } @@ -2166,21 +1806,17 @@ define dso_local { float, double } @test91() { ; LA32-LABEL: test91: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI90_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI90_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI90_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI90_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test91: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI90_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI90_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI90_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI90_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4062500000, double 0.4062500000 } @@ -2190,21 +1826,17 @@ define dso_local { float, double } @test92() { ; LA32-LABEL: 
test92: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI91_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI91_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI91_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI91_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test92: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI91_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI91_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI91_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI91_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4218750000, double 0.4218750000 } @@ -2214,21 +1846,17 @@ define dso_local { float, double } @test93() { ; LA32-LABEL: test93: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI92_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI92_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI92_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI92_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test93: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI92_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI92_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI92_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI92_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4375000000, double 0.4375000000 } @@ -2238,21 +1866,17 @@ define dso_local { float, double } @test94() 
{ ; LA32-LABEL: test94: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI93_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI93_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI93_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI93_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test94: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI93_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI93_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI93_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI93_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4531250000, double 0.4531250000 } @@ -2262,21 +1886,17 @@ define dso_local { float, double } @test95() { ; LA32-LABEL: test95: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI94_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI94_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI94_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI94_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test95: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI94_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI94_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI94_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI94_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4687500000, double 0.4687500000 } @@ -2286,21 +1906,17 @@ define dso_local { float, 
double } @test96() { ; LA32-LABEL: test96: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI95_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI95_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI95_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI95_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test96: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI95_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI95_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI95_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI95_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.4843750000, double 0.4843750000 } @@ -2310,21 +1926,17 @@ define dso_local { float, double } @test97() { ; LA32-LABEL: test97: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI96_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI96_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI96_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI96_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test97: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI96_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI96_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI96_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI96_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.5000000000, double 0.5000000000 } @@ -2334,21 +1946,17 @@ define 
dso_local { float, double } @test98() { ; LA32-LABEL: test98: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI97_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI97_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI97_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI97_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test98: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI97_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI97_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI97_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI97_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.5312500000, double 0.5312500000 } @@ -2358,21 +1966,17 @@ define dso_local { float, double } @test99() { ; LA32-LABEL: test99: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI98_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI98_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI98_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI98_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test99: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI98_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI98_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI98_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI98_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.5625000000, double 0.5625000000 } @@ -2382,21 
+1986,17 @@ define dso_local { float, double } @test100() { ; LA32-LABEL: test100: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI99_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI99_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI99_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI99_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test100: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI99_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI99_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI99_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI99_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.5937500000, double 0.5937500000 } @@ -2406,21 +2006,17 @@ define dso_local { float, double } @test101() { ; LA32-LABEL: test101: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI100_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI100_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI100_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI100_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test101: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI100_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI100_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI100_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI100_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.6250000000, 
double 0.6250000000 } @@ -2430,21 +2026,17 @@ define dso_local { float, double } @test102() { ; LA32-LABEL: test102: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI101_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI101_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI101_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI101_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test102: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI101_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI101_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI101_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI101_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.6562500000, double 0.6562500000 } @@ -2454,21 +2046,17 @@ define dso_local { float, double } @test103() { ; LA32-LABEL: test103: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI102_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI102_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI102_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI102_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test103: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI102_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI102_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI102_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI102_1) ; LA64-NEXT: ret 
entry: ret { float, double } { float 0.6875000000, double 0.6875000000 } @@ -2478,21 +2066,17 @@ define dso_local { float, double } @test104() { ; LA32-LABEL: test104: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI103_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI103_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI103_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI103_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test104: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI103_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI103_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI103_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI103_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.7187500000, double 0.7187500000 } @@ -2502,21 +2086,17 @@ define dso_local { float, double } @test105() { ; LA32-LABEL: test105: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI104_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI104_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI104_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI104_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test105: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI104_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI104_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI104_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d 
$fa1, $a0, %pc_lo12(.LCPI104_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.7500000000, double 0.7500000000 } @@ -2526,21 +2106,17 @@ define dso_local { float, double } @test106() { ; LA32-LABEL: test106: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI105_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI105_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI105_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI105_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test106: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI105_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI105_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI105_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI105_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.7812500000, double 0.7812500000 } @@ -2550,21 +2126,17 @@ define dso_local { float, double } @test107() { ; LA32-LABEL: test107: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI106_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI106_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI106_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI106_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test107: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI106_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI106_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI106_1) -; 
LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI106_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.8125000000, double 0.8125000000 } @@ -2574,21 +2146,17 @@ define dso_local { float, double } @test108() { ; LA32-LABEL: test108: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI107_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI107_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI107_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI107_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test108: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI107_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI107_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI107_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI107_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.8437500000, double 0.8437500000 } @@ -2598,21 +2166,17 @@ define dso_local { float, double } @test109() { ; LA32-LABEL: test109: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI108_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI108_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI108_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI108_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test109: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI108_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI108_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; 
LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI108_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI108_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.8750000000, double 0.8750000000 } @@ -2622,21 +2186,17 @@ define dso_local { float, double } @test110() { ; LA32-LABEL: test110: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI109_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI109_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI109_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI109_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test110: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI109_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI109_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI109_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI109_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.9062500000, double 0.9062500000 } @@ -2646,21 +2206,17 @@ define dso_local { float, double } @test111() { ; LA32-LABEL: test111: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI110_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI110_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI110_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI110_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test111: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI110_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI110_0) ; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI110_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI110_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.9375000000, double 0.9375000000 } @@ -2670,21 +2226,17 @@ define dso_local { float, double } @test112() { ; LA32-LABEL: test112: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI111_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI111_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI111_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI111_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test112: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI111_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI111_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI111_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI111_1) ; LA64-NEXT: ret entry: ret { float, double } { float 0.9687500000, double 0.9687500000 } @@ -2716,21 +2268,17 @@ define dso_local { float, double } @test114() { ; LA32-LABEL: test114: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI113_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI113_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI113_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI113_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test114: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI113_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; 
LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI113_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI113_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI113_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.0625000000, double 1.0625000000 } @@ -2740,21 +2288,17 @@ define dso_local { float, double } @test115() { ; LA32-LABEL: test115: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI114_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI114_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI114_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI114_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test115: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI114_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI114_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI114_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI114_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.1250000000, double 1.1250000000 } @@ -2764,21 +2308,17 @@ define dso_local { float, double } @test116() { ; LA32-LABEL: test116: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI115_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI115_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI115_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI115_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test116: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI115_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI115_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI115_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI115_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.1875000000, double 1.1875000000 } @@ -2788,21 +2328,17 @@ define dso_local { float, double } @test117() { ; LA32-LABEL: test117: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI116_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI116_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI116_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI116_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test117: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI116_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI116_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI116_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI116_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.2500000000, double 1.2500000000 } @@ -2812,21 +2348,17 @@ define dso_local { float, double } @test118() { ; LA32-LABEL: test118: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI117_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI117_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI117_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI117_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test118: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI117_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI117_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI117_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI117_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI117_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.3125000000, double 1.3125000000 } @@ -2836,21 +2368,17 @@ define dso_local { float, double } @test119() { ; LA32-LABEL: test119: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI118_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI118_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI118_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI118_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test119: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI118_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI118_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI118_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI118_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.3750000000, double 1.3750000000 } @@ -2860,21 +2388,17 @@ define dso_local { float, double } @test120() { ; LA32-LABEL: test120: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI119_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI119_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI119_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI119_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test120: ; LA64: 
# %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI119_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI119_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI119_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI119_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.4375000000, double 1.4375000000 } @@ -2884,21 +2408,17 @@ define dso_local { float, double } @test121() { ; LA32-LABEL: test121: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI120_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI120_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI120_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI120_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test121: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI120_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI120_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI120_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI120_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.5000000000, double 1.5000000000 } @@ -2908,21 +2428,17 @@ define dso_local { float, double } @test122() { ; LA32-LABEL: test122: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI121_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI121_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI121_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI121_1) ; 
LA32-NEXT: ret ; ; LA64-LABEL: test122: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI121_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI121_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI121_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI121_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.5625000000, double 1.5625000000 } @@ -2932,21 +2448,17 @@ define dso_local { float, double } @test123() { ; LA32-LABEL: test123: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI122_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI122_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI122_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI122_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test123: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI122_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI122_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI122_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI122_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.6250000000, double 1.6250000000 } @@ -2956,21 +2468,17 @@ define dso_local { float, double } @test124() { ; LA32-LABEL: test124: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI123_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI123_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI123_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; 
LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI123_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test124: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI123_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI123_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI123_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI123_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.6875000000, double 1.6875000000 } @@ -2980,21 +2488,17 @@ define dso_local { float, double } @test125() { ; LA32-LABEL: test125: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI124_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI124_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI124_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI124_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test125: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI124_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI124_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI124_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI124_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.7500000000, double 1.7500000000 } @@ -3004,21 +2508,17 @@ define dso_local { float, double } @test126() { ; LA32-LABEL: test126: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI125_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI125_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI125_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI125_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test126: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI125_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI125_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI125_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI125_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.8125000000, double 1.8125000000 } @@ -3028,21 +2528,17 @@ define dso_local { float, double } @test127() { ; LA32-LABEL: test127: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI126_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI126_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI126_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI126_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test127: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI126_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI126_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI126_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI126_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.8750000000, double 1.8750000000 } @@ -3052,21 +2548,17 @@ define dso_local { float, double } @test128() { ; LA32-LABEL: test128: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI127_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI127_0) ; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI127_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI127_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI127_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test128: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI127_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI127_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI127_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI127_1) ; LA64-NEXT: ret entry: ret { float, double } { float 1.9375000000, double 1.9375000000 } @@ -3076,21 +2568,17 @@ define dso_local { float, double } @test129() { ; LA32-LABEL: test129: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI128_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI128_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI128_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI128_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test129: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI128_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI128_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI128_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI128_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.0000000000, double -2.0000000000 } @@ -3100,21 +2588,17 @@ define dso_local { float, double } @test130() { ; LA32-LABEL: test130: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI129_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI129_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI129_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI129_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test130: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI129_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI129_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI129_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI129_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.1250000000, double -2.1250000000 } @@ -3124,21 +2608,17 @@ define dso_local { float, double } @test131() { ; LA32-LABEL: test131: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI130_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI130_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI130_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI130_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test131: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI130_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI130_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI130_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI130_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.2500000000, double -2.2500000000 } @@ -3148,21 +2628,17 @@ define dso_local { float, double } @test132() { ; LA32-LABEL: test132: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI131_0) -; LA32-NEXT: 
fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI131_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI131_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI131_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test132: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI131_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI131_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI131_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI131_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.3750000000, double -2.3750000000 } @@ -3172,21 +2648,17 @@ define dso_local { float, double } @test133() { ; LA32-LABEL: test133: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI132_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI132_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI132_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI132_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test133: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI132_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI132_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI132_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI132_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.5000000000, double -2.5000000000 } @@ -3196,21 +2668,17 @@ define dso_local { float, double } @test134() { ; LA32-LABEL: test134: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA32-NEXT: 
addi.w $a0, $a0, %pc_lo12(.LCPI133_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI133_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI133_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI133_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test134: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI133_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI133_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI133_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI133_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.6250000000, double -2.6250000000 } @@ -3220,21 +2688,17 @@ define dso_local { float, double } @test135() { ; LA32-LABEL: test135: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI134_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI134_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI134_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI134_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test135: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI134_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI134_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI134_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI134_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.7500000000, double -2.7500000000 } @@ -3244,21 +2708,17 @@ define dso_local { float, double } @test136() { ; LA32-LABEL: test136: ; LA32: # %bb.0: # %entry ; LA32-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI135_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI135_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI135_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI135_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test136: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI135_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI135_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI135_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI135_1) ; LA64-NEXT: ret entry: ret { float, double } { float -2.8750000000, double -2.8750000000 } @@ -3268,21 +2728,17 @@ define dso_local { float, double } @test137() { ; LA32-LABEL: test137: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI136_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI136_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI136_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI136_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test137: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI136_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI136_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI136_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI136_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.0000000000, double -3.0000000000 } @@ -3292,21 +2748,17 @@ define dso_local { float, double } @test138() { ; 
LA32-LABEL: test138: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI137_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI137_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI137_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI137_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test138: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI137_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI137_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI137_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI137_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.1250000000, double -3.1250000000 } @@ -3316,21 +2768,17 @@ define dso_local { float, double } @test139() { ; LA32-LABEL: test139: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI138_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI138_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI138_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI138_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test139: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI138_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI138_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI138_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI138_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.2500000000, double -3.2500000000 } @@ -3340,21 +2788,17 @@ 
define dso_local { float, double } @test140() { ; LA32-LABEL: test140: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI139_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI139_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI139_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI139_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test140: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI139_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI139_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI139_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI139_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.3750000000, double -3.3750000000 } @@ -3364,21 +2808,17 @@ define dso_local { float, double } @test141() { ; LA32-LABEL: test141: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI140_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI140_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI140_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI140_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test141: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI140_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI140_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI140_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI140_1) ; LA64-NEXT: ret entry: ret { float, double } { float 
-3.5000000000, double -3.5000000000 } @@ -3388,21 +2828,17 @@ define dso_local { float, double } @test142() { ; LA32-LABEL: test142: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI141_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI141_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI141_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI141_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test142: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI141_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI141_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI141_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI141_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.6250000000, double -3.6250000000 } @@ -3412,21 +2848,17 @@ define dso_local { float, double } @test143() { ; LA32-LABEL: test143: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI142_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI142_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI142_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI142_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test143: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI142_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI142_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI142_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI142_1) ; 
LA64-NEXT: ret entry: ret { float, double } { float -3.7500000000, double -3.7500000000 } @@ -3436,21 +2868,17 @@ define dso_local { float, double } @test144() { ; LA32-LABEL: test144: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI143_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI143_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI143_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI143_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test144: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI143_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI143_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI143_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI143_1) ; LA64-NEXT: ret entry: ret { float, double } { float -3.8750000000, double -3.8750000000 } @@ -3460,21 +2888,17 @@ define dso_local { float, double } @test145() { ; LA32-LABEL: test145: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI144_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI144_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI144_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI144_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test145: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI144_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI144_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI144_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI144_1) ; LA64-NEXT: ret entry: ret { float, double } { float -4.0000000000, double -4.0000000000 } @@ -3484,21 +2908,17 @@ define dso_local { float, double } @test146() { ; LA32-LABEL: test146: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI145_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI145_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI145_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI145_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test146: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI145_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI145_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI145_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI145_1) ; LA64-NEXT: ret entry: ret { float, double } { float -4.2500000000, double -4.2500000000 } @@ -3508,21 +2928,17 @@ define dso_local { float, double } @test147() { ; LA32-LABEL: test147: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI146_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI146_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI146_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI146_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test147: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI146_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI146_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI146_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI146_1) ; LA64-NEXT: ret entry: ret { float, double } { float -4.5000000000, double -4.5000000000 } @@ -3532,21 +2948,17 @@ define dso_local { float, double } @test148() { ; LA32-LABEL: test148: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI147_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI147_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI147_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI147_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test148: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI147_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI147_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI147_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI147_1) ; LA64-NEXT: ret entry: ret { float, double } { float -4.7500000000, double -4.7500000000 } @@ -3556,21 +2968,17 @@ define dso_local { float, double } @test149() { ; LA32-LABEL: test149: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI148_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI148_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI148_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI148_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test149: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI148_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI148_0) ; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI148_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI148_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI148_1) ; LA64-NEXT: ret entry: ret { float, double } { float -5.0000000000, double -5.0000000000 } @@ -3580,21 +2988,17 @@ define dso_local { float, double } @test150() { ; LA32-LABEL: test150: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI149_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI149_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI149_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI149_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test150: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI149_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI149_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI149_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI149_1) ; LA64-NEXT: ret entry: ret { float, double } { float -5.2500000000, double -5.2500000000 } @@ -3604,21 +3008,17 @@ define dso_local { float, double } @test151() { ; LA32-LABEL: test151: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI150_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI150_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI150_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI150_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test151: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI150_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, 
$a0, %pc_lo12(.LCPI150_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI150_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI150_1) ; LA64-NEXT: ret entry: ret { float, double } { float -5.5000000000, double -5.5000000000 } @@ -3628,21 +3028,17 @@ define dso_local { float, double } @test152() { ; LA32-LABEL: test152: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI151_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI151_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI151_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI151_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test152: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI151_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI151_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI151_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI151_1) ; LA64-NEXT: ret entry: ret { float, double } { float -5.7500000000, double -5.7500000000 } @@ -3652,21 +3048,17 @@ define dso_local { float, double } @test153() { ; LA32-LABEL: test153: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI152_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI152_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI152_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI152_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test153: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI152_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI152_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI152_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI152_1) ; LA64-NEXT: ret entry: ret { float, double } { float -6.0000000000, double -6.0000000000 } @@ -3676,21 +3068,17 @@ define dso_local { float, double } @test154() { ; LA32-LABEL: test154: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI153_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI153_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI153_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI153_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test154: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI153_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI153_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI153_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI153_1) ; LA64-NEXT: ret entry: ret { float, double } { float -6.2500000000, double -6.2500000000 } @@ -3700,21 +3088,17 @@ define dso_local { float, double } @test155() { ; LA32-LABEL: test155: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI154_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI154_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI154_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI154_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test155: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; 
LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI154_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI154_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI154_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI154_1) ; LA64-NEXT: ret entry: ret { float, double } { float -6.5000000000, double -6.5000000000 } @@ -3724,21 +3108,17 @@ define dso_local { float, double } @test156() { ; LA32-LABEL: test156: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI155_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI155_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI155_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI155_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test156: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI155_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI155_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI155_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI155_1) ; LA64-NEXT: ret entry: ret { float, double } { float -6.7500000000, double -6.7500000000 } @@ -3748,21 +3128,17 @@ define dso_local { float, double } @test157() { ; LA32-LABEL: test157: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI156_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI156_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI156_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI156_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test157: ; LA64: # %bb.0: # %entry ; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI156_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI156_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI156_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI156_1) ; LA64-NEXT: ret entry: ret { float, double } { float -7.0000000000, double -7.0000000000 } @@ -3772,21 +3148,17 @@ define dso_local { float, double } @test158() { ; LA32-LABEL: test158: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI157_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI157_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI157_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI157_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test158: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI157_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI157_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI157_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI157_1) ; LA64-NEXT: ret entry: ret { float, double } { float -7.2500000000, double -7.2500000000 } @@ -3796,21 +3168,17 @@ define dso_local { float, double } @test159() { ; LA32-LABEL: test159: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI158_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI158_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI158_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI158_1) ; LA32-NEXT: ret ; 
; LA64-LABEL: test159: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI158_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI158_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI158_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI158_1) ; LA64-NEXT: ret entry: ret { float, double } { float -7.5000000000, double -7.5000000000 } @@ -3820,21 +3188,17 @@ define dso_local { float, double } @test160() { ; LA32-LABEL: test160: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI159_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI159_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI159_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI159_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test160: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI159_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI159_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI159_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI159_1) ; LA64-NEXT: ret entry: ret { float, double } { float -7.7500000000, double -7.7500000000 } @@ -3844,21 +3208,17 @@ define dso_local { float, double } @test161() { ; LA32-LABEL: test161: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI160_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI160_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI160_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d 
$fa1, $a0, %pc_lo12(.LCPI160_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test161: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI160_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI160_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI160_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI160_1) ; LA64-NEXT: ret entry: ret { float, double } { float -8.0000000000, double -8.0000000000 } @@ -3868,21 +3228,17 @@ define dso_local { float, double } @test162() { ; LA32-LABEL: test162: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI161_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI161_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI161_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI161_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test162: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI161_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI161_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI161_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI161_1) ; LA64-NEXT: ret entry: ret { float, double } { float -8.5000000000, double -8.5000000000 } @@ -3892,21 +3248,17 @@ define dso_local { float, double } @test163() { ; LA32-LABEL: test163: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI162_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI162_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI162_1) -; 
LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI162_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test163: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI162_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI162_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI162_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI162_1) ; LA64-NEXT: ret entry: ret { float, double } { float -9.0000000000, double -9.0000000000 } @@ -3916,21 +3268,17 @@ define dso_local { float, double } @test164() { ; LA32-LABEL: test164: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI163_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI163_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI163_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI163_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test164: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI163_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI163_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI163_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI163_1) ; LA64-NEXT: ret entry: ret { float, double } { float -9.5000000000, double -9.5000000000 } @@ -3940,21 +3288,17 @@ define dso_local { float, double } @test165() { ; LA32-LABEL: test165: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI164_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI164_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI164_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI164_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test165: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI164_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI164_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI164_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI164_1) ; LA64-NEXT: ret entry: ret { float, double } { float -10.0000000000, double -10.0000000000 } @@ -3964,21 +3308,17 @@ define dso_local { float, double } @test166() { ; LA32-LABEL: test166: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI165_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI165_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI165_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI165_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test166: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI165_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI165_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI165_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI165_1) ; LA64-NEXT: ret entry: ret { float, double } { float -10.5000000000, double -10.5000000000 } @@ -3988,21 +3328,17 @@ define dso_local { float, double } @test167() { ; LA32-LABEL: test167: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI166_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI166_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI166_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI166_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test167: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI166_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI166_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI166_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI166_1) ; LA64-NEXT: ret entry: ret { float, double } { float -11.0000000000, double -11.0000000000 } @@ -4012,21 +3348,17 @@ define dso_local { float, double } @test168() { ; LA32-LABEL: test168: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI167_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI167_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI167_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI167_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test168: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI167_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI167_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI167_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI167_1) ; LA64-NEXT: ret entry: ret { float, double } { float -11.5000000000, double -11.5000000000 } @@ -4036,21 +3368,17 @@ define dso_local { float, double } @test169() { ; LA32-LABEL: test169: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI168_0) -; 
LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI168_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI168_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI168_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test169: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI168_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI168_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI168_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI168_1) ; LA64-NEXT: ret entry: ret { float, double } { float -12.0000000000, double -12.0000000000 } @@ -4060,21 +3388,17 @@ define dso_local { float, double } @test170() { ; LA32-LABEL: test170: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI169_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI169_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI169_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI169_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test170: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI169_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI169_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI169_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI169_1) ; LA64-NEXT: ret entry: ret { float, double } { float -12.5000000000, double -12.5000000000 } @@ -4084,21 +3408,17 @@ define dso_local { float, double } @test171() { ; LA32-LABEL: test171: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) 
-; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI170_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI170_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI170_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI170_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test171: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI170_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI170_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI170_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI170_1) ; LA64-NEXT: ret entry: ret { float, double } { float -13.0000000000, double -13.0000000000 } @@ -4108,21 +3428,17 @@ define dso_local { float, double } @test172() { ; LA32-LABEL: test172: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI171_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI171_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI171_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI171_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test172: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI171_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI171_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI171_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI171_1) ; LA64-NEXT: ret entry: ret { float, double } { float -13.5000000000, double -13.5000000000 } @@ -4132,21 +3448,17 @@ define dso_local { float, double } @test173() { ; LA32-LABEL: test173: ; LA32: # %bb.0: # 
%entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI172_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI172_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI172_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI172_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test173: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI172_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI172_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI172_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI172_1) ; LA64-NEXT: ret entry: ret { float, double } { float -14.0000000000, double -14.0000000000 } @@ -4156,21 +3468,17 @@ define dso_local { float, double } @test174() { ; LA32-LABEL: test174: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI173_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI173_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI173_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI173_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test174: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI173_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI173_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI173_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI173_1) ; LA64-NEXT: ret entry: ret { float, double } { float -14.5000000000, double -14.5000000000 } @@ -4180,21 +3488,17 @@ define dso_local { float, double } 
@test175() { ; LA32-LABEL: test175: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI174_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI174_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI174_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI174_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test175: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI174_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI174_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI174_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI174_1) ; LA64-NEXT: ret entry: ret { float, double } { float -15.0000000000, double -15.0000000000 } @@ -4204,21 +3508,17 @@ define dso_local { float, double } @test176() { ; LA32-LABEL: test176: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI175_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI175_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI175_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI175_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test176: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI175_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI175_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI175_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI175_1) ; LA64-NEXT: ret entry: ret { float, double } { float -15.5000000000, double -15.5000000000 } @@ 
-4228,21 +3528,17 @@ define dso_local { float, double } @test177() { ; LA32-LABEL: test177: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI176_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI176_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI176_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI176_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test177: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI176_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI176_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI176_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI176_1) ; LA64-NEXT: ret entry: ret { float, double } { float -16.0000000000, double -16.0000000000 } @@ -4252,21 +3548,17 @@ define dso_local { float, double } @test178() { ; LA32-LABEL: test178: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI177_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI177_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI177_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI177_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test178: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI177_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI177_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI177_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI177_1) ; LA64-NEXT: ret entry: ret { float, 
double } { float -17.0000000000, double -17.0000000000 } @@ -4276,21 +3568,17 @@ define dso_local { float, double } @test179() { ; LA32-LABEL: test179: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI178_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI178_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI178_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI178_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test179: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI178_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI178_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI178_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI178_1) ; LA64-NEXT: ret entry: ret { float, double } { float -18.0000000000, double -18.0000000000 } @@ -4300,21 +3588,17 @@ define dso_local { float, double } @test180() { ; LA32-LABEL: test180: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI179_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI179_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI179_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI179_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test180: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI179_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI179_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI179_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI179_1) ; LA64-NEXT: ret entry: ret { float, double } { float -19.0000000000, double -19.0000000000 } @@ -4324,21 +3608,17 @@ define dso_local { float, double } @test181() { ; LA32-LABEL: test181: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI180_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI180_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI180_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI180_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test181: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI180_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI180_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI180_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI180_1) ; LA64-NEXT: ret entry: ret { float, double } { float -20.0000000000, double -20.0000000000 } @@ -4348,21 +3628,17 @@ define dso_local { float, double } @test182() { ; LA32-LABEL: test182: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI181_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI181_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI181_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI181_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test182: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI181_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI181_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI181_1) -; 
LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI181_1) ; LA64-NEXT: ret entry: ret { float, double } { float -21.0000000000, double -21.0000000000 } @@ -4372,21 +3648,17 @@ define dso_local { float, double } @test183() { ; LA32-LABEL: test183: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI182_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI182_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI182_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI182_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test183: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI182_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI182_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI182_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI182_1) ; LA64-NEXT: ret entry: ret { float, double } { float -22.0000000000, double -22.0000000000 } @@ -4396,21 +3668,17 @@ define dso_local { float, double } @test184() { ; LA32-LABEL: test184: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI183_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI183_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI183_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI183_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test184: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI183_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI183_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) 
-; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI183_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI183_1) ; LA64-NEXT: ret entry: ret { float, double } { float -23.0000000000, double -23.0000000000 } @@ -4420,21 +3688,17 @@ define dso_local { float, double } @test185() { ; LA32-LABEL: test185: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI184_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI184_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI184_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI184_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test185: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI184_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI184_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI184_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI184_1) ; LA64-NEXT: ret entry: ret { float, double } { float -24.0000000000, double -24.0000000000 } @@ -4444,21 +3708,17 @@ define dso_local { float, double } @test186() { ; LA32-LABEL: test186: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI185_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI185_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI185_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI185_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test186: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI185_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI185_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI185_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI185_1) ; LA64-NEXT: ret entry: ret { float, double } { float -25.0000000000, double -25.0000000000 } @@ -4468,21 +3728,17 @@ define dso_local { float, double } @test187() { ; LA32-LABEL: test187: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI186_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI186_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI186_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI186_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test187: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI186_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI186_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI186_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI186_1) ; LA64-NEXT: ret entry: ret { float, double } { float -26.0000000000, double -26.0000000000 } @@ -4492,21 +3748,17 @@ define dso_local { float, double } @test188() { ; LA32-LABEL: test188: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI187_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI187_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI187_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI187_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test188: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI187_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI187_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI187_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI187_1) ; LA64-NEXT: ret entry: ret { float, double } { float -27.0000000000, double -27.0000000000 } @@ -4516,21 +3768,17 @@ define dso_local { float, double } @test189() { ; LA32-LABEL: test189: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI188_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI188_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI188_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI188_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test189: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI188_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI188_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI188_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI188_1) ; LA64-NEXT: ret entry: ret { float, double } { float -28.0000000000, double -28.0000000000 } @@ -4540,21 +3788,17 @@ define dso_local { float, double } @test190() { ; LA32-LABEL: test190: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI189_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI189_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI189_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI189_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test190: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) 
-; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI189_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI189_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI189_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI189_1) ; LA64-NEXT: ret entry: ret { float, double } { float -29.0000000000, double -29.0000000000 } @@ -4564,21 +3808,17 @@ define dso_local { float, double } @test191() { ; LA32-LABEL: test191: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI190_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI190_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI190_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI190_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test191: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI190_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI190_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI190_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI190_1) ; LA64-NEXT: ret entry: ret { float, double } { float -30.0000000000, double -30.0000000000 } @@ -4588,21 +3828,17 @@ define dso_local { float, double } @test192() { ; LA32-LABEL: test192: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI191_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI191_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI191_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI191_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test192: ; LA64: # %bb.0: # 
%entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI191_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI191_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI191_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI191_1) ; LA64-NEXT: ret entry: ret { float, double } { float -31.0000000000, double -31.0000000000 } @@ -4612,21 +3848,17 @@ define dso_local { float, double } @test193() { ; LA32-LABEL: test193: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI192_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI192_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI192_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI192_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test193: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI192_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI192_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI192_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI192_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1250000000, double -0.1250000000 } @@ -4636,21 +3868,17 @@ define dso_local { float, double } @test194() { ; LA32-LABEL: test194: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI193_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI193_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI193_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI193_1) ; 
LA32-NEXT: ret ; ; LA64-LABEL: test194: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI193_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI193_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI193_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI193_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1328125000, double -0.1328125000 } @@ -4660,21 +3888,17 @@ define dso_local { float, double } @test195() { ; LA32-LABEL: test195: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI194_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI194_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI194_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI194_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test195: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI194_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI194_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI194_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI194_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1406250000, double -0.1406250000 } @@ -4684,21 +3908,17 @@ define dso_local { float, double } @test196() { ; LA32-LABEL: test196: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI195_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI195_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI195_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; 
LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI195_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test196: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI195_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI195_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI195_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI195_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1484375000, double -0.1484375000 } @@ -4708,21 +3928,17 @@ define dso_local { float, double } @test197() { ; LA32-LABEL: test197: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI196_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI196_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI196_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI196_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test197: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI196_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI196_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI196_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI196_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1562500000, double -0.1562500000 } @@ -4732,21 +3948,17 @@ define dso_local { float, double } @test198() { ; LA32-LABEL: test198: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI197_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI197_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI197_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI197_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test198: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI197_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI197_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI197_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI197_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1640625000, double -0.1640625000 } @@ -4756,21 +3968,17 @@ define dso_local { float, double } @test199() { ; LA32-LABEL: test199: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI198_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI198_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI198_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI198_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test199: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI198_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI198_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI198_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI198_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1718750000, double -0.1718750000 } @@ -4780,21 +3988,17 @@ define dso_local { float, double } @test200() { ; LA32-LABEL: test200: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI199_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI199_0) ; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI199_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI199_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI199_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test200: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI199_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI199_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI199_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI199_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1796875000, double -0.1796875000 } @@ -4804,21 +4008,17 @@ define dso_local { float, double } @test201() { ; LA32-LABEL: test201: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI200_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI200_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI200_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI200_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test201: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI200_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI200_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI200_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI200_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1875000000, double -0.1875000000 } @@ -4828,21 +4028,17 @@ define dso_local { float, double } @test202() { ; LA32-LABEL: test202: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI201_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, 
$a0, %pc_lo12(.LCPI201_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI201_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI201_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test202: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI201_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI201_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI201_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI201_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.1953125000, double -0.1953125000 } @@ -4852,21 +4048,17 @@ define dso_local { float, double } @test203() { ; LA32-LABEL: test203: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI202_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI202_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI202_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI202_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test203: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI202_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI202_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI202_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI202_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2031250000, double -0.2031250000 } @@ -4876,21 +4068,17 @@ define dso_local { float, double } @test204() { ; LA32-LABEL: test204: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI203_0) -; 
LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI203_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI203_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI203_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test204: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI203_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI203_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI203_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI203_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2109375000, double -0.2109375000 } @@ -4900,21 +4088,17 @@ define dso_local { float, double } @test205() { ; LA32-LABEL: test205: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI204_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI204_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI204_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI204_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test205: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI204_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI204_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI204_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI204_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2187500000, double -0.2187500000 } @@ -4924,21 +4108,17 @@ define dso_local { float, double } @test206() { ; LA32-LABEL: test206: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI205_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI205_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI205_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI205_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test206: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI205_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI205_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI205_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI205_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2265625000, double -0.2265625000 } @@ -4948,21 +4128,17 @@ define dso_local { float, double } @test207() { ; LA32-LABEL: test207: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI206_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI206_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI206_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI206_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test207: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI206_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI206_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI206_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI206_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2343750000, double -0.2343750000 } @@ -4972,21 +4148,17 @@ define dso_local { float, double } @test208() { ; LA32-LABEL: test208: ; LA32: # %bb.0: # %entry ; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI207_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI207_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI207_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI207_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test208: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI207_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI207_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI207_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI207_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2421875000, double -0.2421875000 } @@ -4996,21 +4168,17 @@ define dso_local { float, double } @test209() { ; LA32-LABEL: test209: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI208_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI208_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI208_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI208_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test209: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI208_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI208_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI208_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI208_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2500000000, double -0.2500000000 } @@ -5020,21 +4188,17 @@ define dso_local { float, double } @test210() { 
; LA32-LABEL: test210: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI209_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI209_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI209_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI209_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test210: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI209_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI209_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI209_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI209_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2656250000, double -0.2656250000 } @@ -5044,21 +4208,17 @@ define dso_local { float, double } @test211() { ; LA32-LABEL: test211: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI210_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI210_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI210_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI210_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test211: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI210_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI210_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI210_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI210_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2812500000, double -0.2812500000 } @@ -5068,21 +4228,17 
@@ define dso_local { float, double } @test212() { ; LA32-LABEL: test212: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI211_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI211_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI211_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI211_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test212: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI211_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI211_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI211_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI211_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.2968750000, double -0.2968750000 } @@ -5092,21 +4248,17 @@ define dso_local { float, double } @test213() { ; LA32-LABEL: test213: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI212_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI212_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI212_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI212_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test213: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI212_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI212_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI212_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI212_1) ; LA64-NEXT: ret entry: ret { float, double } { float 
-0.3125000000, double -0.3125000000 } @@ -5116,21 +4268,17 @@ define dso_local { float, double } @test214() { ; LA32-LABEL: test214: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI213_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI213_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI213_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI213_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test214: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI213_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI213_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI213_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI213_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.3281250000, double -0.3281250000 } @@ -5140,21 +4288,17 @@ define dso_local { float, double } @test215() { ; LA32-LABEL: test215: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI214_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI214_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI214_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI214_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test215: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI214_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI214_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI214_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI214_1) ; 
LA64-NEXT: ret entry: ret { float, double } { float -0.3437500000, double -0.3437500000 } @@ -5164,21 +4308,17 @@ define dso_local { float, double } @test216() { ; LA32-LABEL: test216: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI215_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI215_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI215_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI215_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test216: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI215_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI215_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI215_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI215_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.3593750000, double -0.3593750000 } @@ -5188,21 +4328,17 @@ define dso_local { float, double } @test217() { ; LA32-LABEL: test217: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI216_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI216_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI216_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI216_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test217: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI216_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI216_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI216_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI216_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.3750000000, double -0.3750000000 } @@ -5212,21 +4348,17 @@ define dso_local { float, double } @test218() { ; LA32-LABEL: test218: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI217_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI217_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI217_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI217_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test218: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI217_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI217_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI217_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI217_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.3906250000, double -0.3906250000 } @@ -5236,21 +4368,17 @@ define dso_local { float, double } @test219() { ; LA32-LABEL: test219: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI218_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI218_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI218_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI218_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test219: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI218_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI218_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA64-NEXT: addi.d $a0, $a0, 
%pc_lo12(.LCPI218_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI218_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4062500000, double -0.4062500000 } @@ -5260,21 +4388,17 @@ define dso_local { float, double } @test220() { ; LA32-LABEL: test220: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI219_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI219_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI219_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI219_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test220: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI219_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI219_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI219_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI219_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4218750000, double -0.4218750000 } @@ -5284,21 +4408,17 @@ define dso_local { float, double } @test221() { ; LA32-LABEL: test221: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI220_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI220_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI220_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI220_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test221: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI220_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI220_0) ; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI220_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI220_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI220_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4375000000, double -0.4375000000 } @@ -5308,21 +4428,17 @@ define dso_local { float, double } @test222() { ; LA32-LABEL: test222: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI221_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI221_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI221_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI221_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test222: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI221_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI221_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI221_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI221_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4531250000, double -0.4531250000 } @@ -5332,21 +4448,17 @@ define dso_local { float, double } @test223() { ; LA32-LABEL: test223: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI222_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI222_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI222_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI222_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test223: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI222_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, 
$a0, %pc_lo12(.LCPI222_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI222_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI222_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4687500000, double -0.4687500000 } @@ -5356,21 +4468,17 @@ define dso_local { float, double } @test224() { ; LA32-LABEL: test224: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI223_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI223_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI223_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI223_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test224: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI223_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI223_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI223_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI223_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.4843750000, double -0.4843750000 } @@ -5380,21 +4488,17 @@ define dso_local { float, double } @test225() { ; LA32-LABEL: test225: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI224_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI224_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI224_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI224_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test225: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI224_0) -; 
LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI224_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI224_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI224_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.5000000000, double -0.5000000000 } @@ -5404,21 +4508,17 @@ define dso_local { float, double } @test226() { ; LA32-LABEL: test226: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI225_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI225_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI225_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI225_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test226: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI225_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI225_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI225_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI225_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.5312500000, double -0.5312500000 } @@ -5428,21 +4528,17 @@ define dso_local { float, double } @test227() { ; LA32-LABEL: test227: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI226_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI226_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI226_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI226_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test227: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; 
LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI226_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI226_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI226_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI226_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.5625000000, double -0.5625000000 } @@ -5452,21 +4548,17 @@ define dso_local { float, double } @test228() { ; LA32-LABEL: test228: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI227_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI227_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI227_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI227_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test228: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI227_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI227_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI227_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI227_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.5937500000, double -0.5937500000 } @@ -5476,21 +4568,17 @@ define dso_local { float, double } @test229() { ; LA32-LABEL: test229: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI228_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI228_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI228_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI228_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test229: ; LA64: # %bb.0: # %entry ; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI228_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI228_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI228_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI228_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.6250000000, double -0.6250000000 } @@ -5500,21 +4588,17 @@ define dso_local { float, double } @test230() { ; LA32-LABEL: test230: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI229_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI229_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI229_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI229_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test230: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI229_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI229_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI229_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI229_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.6562500000, double -0.6562500000 } @@ -5524,21 +4608,17 @@ define dso_local { float, double } @test231() { ; LA32-LABEL: test231: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI230_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI230_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI230_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI230_1) ; LA32-NEXT: ret ; 
; LA64-LABEL: test231: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI230_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI230_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI230_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI230_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.6875000000, double -0.6875000000 } @@ -5548,21 +4628,17 @@ define dso_local { float, double } @test232() { ; LA32-LABEL: test232: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI231_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI231_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI231_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI231_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test232: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI231_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI231_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI231_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI231_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.7187500000, double -0.7187500000 } @@ -5572,21 +4648,17 @@ define dso_local { float, double } @test233() { ; LA32-LABEL: test233: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI232_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI232_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI232_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d 
$fa1, $a0, %pc_lo12(.LCPI232_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test233: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI232_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI232_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI232_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI232_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.7500000000, double -0.7500000000 } @@ -5596,21 +4668,17 @@ define dso_local { float, double } @test234() { ; LA32-LABEL: test234: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI233_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI233_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI233_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI233_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test234: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI233_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI233_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI233_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI233_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.7812500000, double -0.7812500000 } @@ -5620,21 +4688,17 @@ define dso_local { float, double } @test235() { ; LA32-LABEL: test235: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI234_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI234_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI234_1) -; 
LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI234_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test235: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI234_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI234_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI234_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI234_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.8125000000, double -0.8125000000 } @@ -5644,21 +4708,17 @@ define dso_local { float, double } @test236() { ; LA32-LABEL: test236: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI235_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI235_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI235_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI235_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test236: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI235_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI235_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI235_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI235_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.8437500000, double -0.8437500000 } @@ -5668,21 +4728,17 @@ define dso_local { float, double } @test237() { ; LA32-LABEL: test237: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI236_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI236_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; 
LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI236_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI236_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test237: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI236_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI236_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI236_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI236_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.8750000000, double -0.8750000000 } @@ -5692,21 +4748,17 @@ define dso_local { float, double } @test238() { ; LA32-LABEL: test238: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI237_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI237_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI237_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI237_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test238: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI237_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI237_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI237_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI237_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.9062500000, double -0.9062500000 } @@ -5716,21 +4768,17 @@ define dso_local { float, double } @test239() { ; LA32-LABEL: test239: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI238_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI238_0) ; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI238_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI238_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test239: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI238_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI238_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI238_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI238_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.9375000000, double -0.9375000000 } @@ -5740,21 +4788,17 @@ define dso_local { float, double } @test240() { ; LA32-LABEL: test240: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI239_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI239_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI239_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI239_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test240: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI239_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI239_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI239_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI239_1) ; LA64-NEXT: ret entry: ret { float, double } { float -0.9687500000, double -0.9687500000 } @@ -5764,21 +4808,17 @@ define dso_local { float, double } @test241() { ; LA32-LABEL: test241: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI240_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; 
LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI240_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI240_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI240_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test241: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI240_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI240_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI240_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI240_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.0000000000, double -1.0000000000 } @@ -5788,21 +4828,17 @@ define dso_local { float, double } @test242() { ; LA32-LABEL: test242: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI241_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI241_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI241_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI241_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test242: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI241_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI241_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI241_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI241_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.0625000000, double -1.0625000000 } @@ -5812,21 +4848,17 @@ define dso_local { float, double } @test243() { ; LA32-LABEL: test243: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(.LCPI242_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI242_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI242_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI242_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test243: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI242_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI242_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI242_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI242_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.1250000000, double -1.1250000000 } @@ -5836,21 +4868,17 @@ define dso_local { float, double } @test244() { ; LA32-LABEL: test244: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI243_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI243_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI243_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI243_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test244: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI243_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI243_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI243_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI243_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.1875000000, double -1.1875000000 } @@ -5860,21 +4888,17 @@ define dso_local { float, double } @test245() { ; LA32-LABEL: test245: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI244_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI244_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI244_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI244_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI244_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test245: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI244_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI244_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI244_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI244_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.2500000000, double -1.2500000000 } @@ -5884,21 +4908,17 @@ define dso_local { float, double } @test246() { ; LA32-LABEL: test246: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI245_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI245_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI245_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI245_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test246: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI245_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI245_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI245_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI245_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.3125000000, double -1.3125000000 } @@ -5908,21 +4928,17 @@ define dso_local { float, double } @test247() { ; LA32-LABEL: test247: ; 
LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI246_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI246_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI246_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI246_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test247: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI246_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI246_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI246_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI246_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.3750000000, double -1.3750000000 } @@ -5932,21 +4948,17 @@ define dso_local { float, double } @test248() { ; LA32-LABEL: test248: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI247_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI247_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI247_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI247_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test248: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI247_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI247_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI247_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI247_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.4375000000, double -1.4375000000 } @@ -5956,21 +4968,17 @@ define dso_local { 
float, double } @test249() { ; LA32-LABEL: test249: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI248_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI248_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI248_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI248_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test249: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI248_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI248_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI248_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI248_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.5000000000, double -1.5000000000 } @@ -5980,21 +4988,17 @@ define dso_local { float, double } @test250() { ; LA32-LABEL: test250: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI249_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI249_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI249_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI249_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test250: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI249_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI249_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI249_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI249_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.5625000000, double 
-1.5625000000 } @@ -6004,21 +5008,17 @@ define dso_local { float, double } @test251() { ; LA32-LABEL: test251: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI250_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI250_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI250_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI250_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test251: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI250_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI250_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI250_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI250_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.6250000000, double -1.6250000000 } @@ -6028,21 +5028,17 @@ define dso_local { float, double } @test252() { ; LA32-LABEL: test252: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI251_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI251_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI251_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI251_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test252: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI251_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI251_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI251_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI251_1) ; LA64-NEXT: ret entry: 
ret { float, double } { float -1.6875000000, double -1.6875000000 } @@ -6052,21 +5048,17 @@ define dso_local { float, double } @test253() { ; LA32-LABEL: test253: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI252_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI252_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI252_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI252_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test253: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI252_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI252_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI252_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI252_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.7500000000, double -1.7500000000 } @@ -6076,21 +5068,17 @@ define dso_local { float, double } @test254() { ; LA32-LABEL: test254: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI253_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI253_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI253_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI253_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test254: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI253_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI253_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI253_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, 
$a0, %pc_lo12(.LCPI253_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.8125000000, double -1.8125000000 } @@ -6100,21 +5088,17 @@ define dso_local { float, double } @test255() { ; LA32-LABEL: test255: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI254_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI254_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI254_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI254_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test255: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI254_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI254_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI254_1) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI254_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.8750000000, double -1.8750000000 } @@ -6124,21 +5108,17 @@ define dso_local { float, double } @test256() { ; LA32-LABEL: test256: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI255_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI255_0) ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI255_1) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI255_1) ; LA32-NEXT: ret ; ; LA64-LABEL: test256: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI255_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI255_0) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI255_1) -; 
LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI255_1) ; LA64-NEXT: ret entry: ret { float, double } { float -1.9375000000, double -1.9375000000 } diff --git a/llvm/test/CodeGen/LoongArch/float-imm.ll b/llvm/test/CodeGen/LoongArch/float-imm.ll index e2cbf4bf9b3e874..006a9e64b190de7 100644 --- a/llvm/test/CodeGen/LoongArch/float-imm.ll +++ b/llvm/test/CodeGen/LoongArch/float-imm.ll @@ -34,15 +34,13 @@ define float @f32_constant_pi() nounwind { ; LA32-LABEL: f32_constant_pi: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA32-NEXT: ret ; ; LA64-LABEL: f32_constant_pi: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) ; LA64-NEXT: ret ret float 3.14159274101257324218750 } diff --git a/llvm/test/CodeGen/LoongArch/ghc-cc.ll b/llvm/test/CodeGen/LoongArch/ghc-cc.ll index 735315d323a3626..f99759b4b5ed563 100644 --- a/llvm/test/CodeGen/LoongArch/ghc-cc.ll +++ b/llvm/test/CodeGen/LoongArch/ghc-cc.ll @@ -27,56 +27,39 @@ define ghccc void @foo() nounwind { ; LA64-LABEL: foo: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d4) -; LA64-NEXT: fld.d $fs7, $a0, 0 +; LA64-NEXT: fld.d $fs7, $a0, %pc_lo12(d4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d3) -; LA64-NEXT: fld.d $fs6, $a0, 0 +; LA64-NEXT: fld.d $fs6, $a0, %pc_lo12(d3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d2) -; LA64-NEXT: fld.d $fs5, $a0, 0 +; LA64-NEXT: fld.d $fs5, $a0, %pc_lo12(d2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(d1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(d1) -; LA64-NEXT: fld.d $fs4, $a0, 0 +; LA64-NEXT: fld.d $fs4, $a0, %pc_lo12(d1) ; LA64-NEXT: 
pcalau12i $a0, %pc_hi20(f4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f4) -; LA64-NEXT: fld.s $fs3, $a0, 0 +; LA64-NEXT: fld.s $fs3, $a0, %pc_lo12(f4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f3) -; LA64-NEXT: fld.s $fs2, $a0, 0 +; LA64-NEXT: fld.s $fs2, $a0, %pc_lo12(f3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f2) -; LA64-NEXT: fld.s $fs1, $a0, 0 +; LA64-NEXT: fld.s $fs1, $a0, %pc_lo12(f2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(f1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(f1) -; LA64-NEXT: fld.s $fs0, $a0, 0 +; LA64-NEXT: fld.s $fs0, $a0, %pc_lo12(f1) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(splim) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(splim) -; LA64-NEXT: ld.d $s8, $a0, 0 +; LA64-NEXT: ld.d $s8, $a0, %pc_lo12(splim) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r5) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r5) -; LA64-NEXT: ld.d $s7, $a0, 0 +; LA64-NEXT: ld.d $s7, $a0, %pc_lo12(r5) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r4) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r4) -; LA64-NEXT: ld.d $s6, $a0, 0 +; LA64-NEXT: ld.d $s6, $a0, %pc_lo12(r4) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r3) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r3) -; LA64-NEXT: ld.d $s5, $a0, 0 +; LA64-NEXT: ld.d $s5, $a0, %pc_lo12(r3) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r2) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r2) -; LA64-NEXT: ld.d $s4, $a0, 0 +; LA64-NEXT: ld.d $s4, $a0, %pc_lo12(r2) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(r1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(r1) -; LA64-NEXT: ld.d $s3, $a0, 0 +; LA64-NEXT: ld.d $s3, $a0, %pc_lo12(r1) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(hp) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(hp) -; LA64-NEXT: ld.d $s2, $a0, 0 +; LA64-NEXT: ld.d $s2, $a0, %pc_lo12(hp) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(sp) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(sp) -; LA64-NEXT: ld.d $s1, $a0, 0 +; LA64-NEXT: ld.d $s1, $a0, %pc_lo12(sp) ; LA64-NEXT: pcalau12i $a0, %pc_hi20(base) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(base) -; 
LA64-NEXT: ld.d $s0, $a0, 0 +; LA64-NEXT: ld.d $s0, $a0, %pc_lo12(base) ; LA64-NEXT: b %plt(bar) entry: diff --git a/llvm/test/CodeGen/LoongArch/global-address.ll b/llvm/test/CodeGen/LoongArch/global-address.ll index 2423dd81a4d3a8b..89ea48c3b1cbf68 100644 --- a/llvm/test/CodeGen/LoongArch/global-address.ll +++ b/llvm/test/CodeGen/LoongArch/global-address.ll @@ -16,8 +16,7 @@ define void @foo() nounwind { ; LA32NOPIC-NEXT: ld.w $a0, $a0, %got_pc_lo12(G) ; LA32NOPIC-NEXT: ld.w $zero, $a0, 0 ; LA32NOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) -; LA32NOPIC-NEXT: addi.w $a0, $a0, %pc_lo12(g) -; LA32NOPIC-NEXT: ld.w $zero, $a0, 0 +; LA32NOPIC-NEXT: ld.w $zero, $a0, %pc_lo12(g) ; LA32NOPIC-NEXT: ret ; ; LA32PIC-LABEL: foo: @@ -26,8 +25,7 @@ define void @foo() nounwind { ; LA32PIC-NEXT: ld.w $a0, $a0, %got_pc_lo12(G) ; LA32PIC-NEXT: ld.w $zero, $a0, 0 ; LA32PIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; LA32PIC-NEXT: addi.w $a0, $a0, %pc_lo12(.Lg$local) -; LA32PIC-NEXT: ld.w $zero, $a0, 0 +; LA32PIC-NEXT: ld.w $zero, $a0, %pc_lo12(.Lg$local) ; LA32PIC-NEXT: ret ; ; LA64NOPIC-LABEL: foo: @@ -36,8 +34,7 @@ define void @foo() nounwind { ; LA64NOPIC-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; LA64NOPIC-NEXT: ld.w $zero, $a0, 0 ; LA64NOPIC-NEXT: pcalau12i $a0, %pc_hi20(g) -; LA64NOPIC-NEXT: addi.d $a0, $a0, %pc_lo12(g) -; LA64NOPIC-NEXT: ld.w $zero, $a0, 0 +; LA64NOPIC-NEXT: ld.w $zero, $a0, %pc_lo12(g) ; LA64NOPIC-NEXT: ret ; ; LA64PIC-LABEL: foo: @@ -46,8 +43,7 @@ define void @foo() nounwind { ; LA64PIC-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; LA64PIC-NEXT: ld.w $zero, $a0, 0 ; LA64PIC-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; LA64PIC-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; LA64PIC-NEXT: ld.w $zero, $a0, 0 +; LA64PIC-NEXT: ld.w $zero, $a0, %pc_lo12(.Lg$local) ; LA64PIC-NEXT: ret ; ; LA64LARGENOPIC-LABEL: foo: @@ -62,8 +58,7 @@ define void @foo() nounwind { ; LA64LARGENOPIC-NEXT: addi.d $a1, $zero, %pc_lo12(g) ; LA64LARGENOPIC-NEXT: lu32i.d $a1, %pc64_lo20(g) ; 
LA64LARGENOPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g) -; LA64LARGENOPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGENOPIC-NEXT: ld.w $zero, $a0, 0 +; LA64LARGENOPIC-NEXT: ldx.w $zero, $a1, $a0 ; LA64LARGENOPIC-NEXT: ret ; ; LA64LARGEPIC-LABEL: foo: @@ -78,8 +73,7 @@ define void @foo() nounwind { ; LA64LARGEPIC-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LA64LARGEPIC-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LA64LARGEPIC-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LA64LARGEPIC-NEXT: add.d $a0, $a1, $a0 -; LA64LARGEPIC-NEXT: ld.w $zero, $a0, 0 +; LA64LARGEPIC-NEXT: ldx.w $zero, $a1, $a0 ; LA64LARGEPIC-NEXT: ret %V = load volatile i32, ptr @G %v = load volatile i32, ptr @g diff --git a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll index 2b7a862ecde11e1..04f6a635778eba5 100644 --- a/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll +++ b/llvm/test/CodeGen/LoongArch/global-variable-code-model.ll @@ -7,8 +7,7 @@ define dso_local signext i32 @local_small() #0 { ; CHECK-LABEL: local_small: ; CHECK: # %bb.0: ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(a) -; CHECK-NEXT: addi.d $a0, $a0, %pc_lo12(a) -; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ld.w $a0, $a0, %pc_lo12(a) ; CHECK-NEXT: ret %1 = load i32, ptr @a, align 4 ret i32 %1 @@ -23,8 +22,7 @@ define dso_local signext i32 @local_large() #0 { ; CHECK-NEXT: addi.d $a1, $zero, %pc_lo12(b) ; CHECK-NEXT: lu32i.d $a1, %pc64_lo20(b) ; CHECK-NEXT: lu52i.d $a1, $a1, %pc64_hi12(b) -; CHECK-NEXT: add.d $a0, $a1, $a0 -; CHECK-NEXT: ld.w $a0, $a0, 0 +; CHECK-NEXT: ldx.w $a0, $a1, $a0 ; CHECK-NEXT: ret %1 = load i32, ptr @b, align 4 ret i32 %1 diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll index fa675e4bbb32434..be9ea29b54c3320 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll @@ -10,8 +10,7 @@ define 
double @constraint_f_double(double %a) nounwind { ; LA32-LABEL: constraint_f_double: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(gd) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(gd) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(gd) ; LA32-NEXT: #APP ; LA32-NEXT: fadd.d $fa0, $fa0, $fa1 ; LA32-NEXT: #NO_APP @@ -20,8 +19,7 @@ define double @constraint_f_double(double %a) nounwind { ; LA64-LABEL: constraint_f_double: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(gd) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(gd) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(gd) ; LA64-NEXT: #APP ; LA64-NEXT: fadd.d $fa0, $fa0, $fa1 ; LA64-NEXT: #NO_APP diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll index becb3cae46b8c05..565ccdbe6880fbf 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-m.ll @@ -147,19 +147,17 @@ define i32 @m_offset_2048(ptr %p) nounwind { define i32 @m_addr_pcrel() nounwind { ; LA32-LABEL: m_addr_pcrel: ; LA32: # %bb.0: -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a1, $a0, %pc_lo12(g_i32) +; LA32-NEXT: pcalau12i $a1, %pc_hi20(g_i32) ; LA32-NEXT: #APP -; LA32-NEXT: ld.w $a0, $a1, 0 +; LA32-NEXT: ld.w $a0, $a1, %pc_lo12(g_i32) ; LA32-NEXT: #NO_APP ; LA32-NEXT: ret ; ; LA64-LABEL: m_addr_pcrel: ; LA64: # %bb.0: -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a1, $a0, %pc_lo12(g_i32) +; LA64-NEXT: pcalau12i $a1, %pc_hi20(g_i32) ; LA64-NEXT: #APP -; LA64-NEXT: ld.w $a0, $a1, 0 +; LA64-NEXT: ld.w $a0, $a1, %pc_lo12(g_i32) ; LA64-NEXT: #NO_APP ; LA64-NEXT: ret %1 = tail call i32 asm sideeffect "ld.w $0, $1", "=&r,*m"(ptr nonnull elementtype(i32) @g_i32) diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll index 193fa6c08600aec..7e320d9245f1c2f 
100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll @@ -79,8 +79,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB1_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -113,8 +112,7 @@ define float @float_fsub_acquire(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI1_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI1_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB1_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -413,8 +411,7 @@ define double @double_fsub_acquire(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB5_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -665,8 +662,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI9_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI9_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB9_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -699,8 +695,7 @@ define float @float_fsub_release(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i 
$a1, %pc_hi20(.LCPI9_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI9_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI9_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB9_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -999,8 +994,7 @@ define double @double_fsub_release(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI13_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI13_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB13_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -1251,8 +1245,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI17_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB17_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -1285,8 +1278,7 @@ define float @float_fsub_acq_rel(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI17_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI17_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI17_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB17_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -1585,8 +1577,7 @@ define double @double_fsub_acq_rel(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI21_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI21_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB21_1: # %atomicrmw.start ; LA64D-NEXT: # 
=>This Inner Loop Header: Depth=1 @@ -1837,8 +1828,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI25_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB25_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -1871,8 +1861,7 @@ define float @float_fsub_seq_cst(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI25_0) -; LA64D-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI25_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI25_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB25_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2171,8 +2160,7 @@ define double @double_fsub_seq_cst(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI29_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI29_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB29_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 @@ -2423,8 +2411,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64F: # %bb.0: ; LA64F-NEXT: fld.s $fa0, $a0, 0 ; LA64F-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) -; LA64F-NEXT: addi.d $a1, $a1, %pc_lo12(.LCPI33_0) -; LA64F-NEXT: fld.s $fa1, $a1, 0 +; LA64F-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI33_0) ; LA64F-NEXT: .p2align 4, , 16 ; LA64F-NEXT: .LBB33_1: # %atomicrmw.start ; LA64F-NEXT: # =>This Loop Header: Depth=1 @@ -2457,8 +2444,7 @@ define float @float_fsub_monotonic(ptr %p) nounwind { ; LA64D: # %bb.0: ; LA64D-NEXT: fld.s $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a1, %pc_hi20(.LCPI33_0) -; LA64D-NEXT: addi.d $a1, $a1, 
%pc_lo12(.LCPI33_0) -; LA64D-NEXT: fld.s $fa1, $a1, 0 +; LA64D-NEXT: fld.s $fa1, $a1, %pc_lo12(.LCPI33_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB33_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Loop Header: Depth=1 @@ -2757,8 +2743,7 @@ define double @double_fsub_monotonic(ptr %p) nounwind { ; LA64D-NEXT: move $fp, $a0 ; LA64D-NEXT: fld.d $fa0, $a0, 0 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI37_0) -; LA64D-NEXT: fld.d $fs0, $a0, 0 +; LA64D-NEXT: fld.d $fs0, $a0, %pc_lo12(.LCPI37_0) ; LA64D-NEXT: .p2align 4, , 16 ; LA64D-NEXT: .LBB37_1: # %atomicrmw.start ; LA64D-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll index ef117f974887157..8d08942c314aa45 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll @@ -117,8 +117,7 @@ define i32 @convert_double_to_u32(double %a) nounwind { ; LA32-LABEL: convert_double_to_u32: ; LA32: # %bb.0: ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI7_0) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA32-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 ; LA32-NEXT: fsub.d $fa1, $fa0, $fa1 ; LA32-NEXT: ftintrz.w.d $fa1, $fa1 @@ -174,8 +173,7 @@ define i64 @convert_double_to_u64(double %a) nounwind { ; LA64-LABEL: convert_double_to_u64: ; LA64: # %bb.0: ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI9_0) -; LA64-NEXT: fld.d $fa1, $a0, 0 +; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_0) ; LA64-NEXT: fcmp.clt.d $fcc0, $fa0, $fa1 ; LA64-NEXT: fsub.d $fa1, $fa0, $fa1 ; LA64-NEXT: ftintrz.l.d $fa1, $fa1 @@ -234,8 +232,7 @@ define double @convert_u32_to_double(i32 %a) nounwind { ; LA32-NEXT: st.w $a0, $sp, 8 ; LA32-NEXT: fld.d $fa0, $sp, 8 ; LA32-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI12_0) -; LA32-NEXT: fld.d $fa1, $a0, 0 +; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_0) ; LA32-NEXT: fsub.d $fa0, $fa0, $fa1 ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -264,8 +261,7 @@ define double @convert_u64_to_double(i64 %a) nounwind { ; LA64: # %bb.0: ; LA64-NEXT: srli.d $a1, $a0, 32 ; LA64-NEXT: pcalau12i $a2, %pc_hi20(.LCPI13_0) -; LA64-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI13_0) -; LA64-NEXT: fld.d $fa0, $a2, 0 +; LA64-NEXT: fld.d $fa0, $a2, %pc_lo12(.LCPI13_0) ; LA64-NEXT: lu52i.d $a2, $zero, 1107 ; LA64-NEXT: or $a1, $a1, $a2 ; LA64-NEXT: movgr2fr.d $fa1, $a1 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll index b01b84ba385ec84..b7de5a592c35983 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll @@ -182,8 +182,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA32F-LABEL: convert_float_to_u32: ; LA32F: # %bb.0: ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32F-NEXT: fld.s $fa1, $a0, 0 +; LA32F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA32F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA32F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA32F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -201,8 +200,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA32D-LABEL: convert_float_to_u32: ; LA32D: # %bb.0: ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI6_0) -; LA32D-NEXT: fld.s $fa1, $a0, 0 +; LA32D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA32D-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA32D-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA32D-NEXT: ftintrz.w.s $fa1, $fa1 @@ -220,8 +218,7 @@ define i32 @convert_float_to_u32(float %a) nounwind { ; LA64F-LABEL: convert_float_to_u32: ; LA64F: # %bb.0: ; LA64F-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI6_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI6_0) -; LA64F-NEXT: fld.s $fa1, $a0, 0 +; LA64F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI6_0) ; LA64F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -267,8 +264,7 @@ define i64 @convert_float_to_u64(float %a) nounwind { ; LA64F-LABEL: convert_float_to_u64: ; LA64F: # %bb.0: ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64F-NEXT: fld.s $fa1, $a0, 0 +; LA64F-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA64F-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64F-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64F-NEXT: ftintrz.w.s $fa1, $fa1 @@ -286,8 +282,7 @@ define i64 @convert_float_to_u64(float %a) nounwind { ; LA64D-LABEL: convert_float_to_u64: ; LA64D: # %bb.0: ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI7_0) -; LA64D-NEXT: fld.s $fa1, $a0, 0 +; LA64D-NEXT: fld.s $fa1, $a0, %pc_lo12(.LCPI7_0) ; LA64D-NEXT: fcmp.clt.s $fcc0, $fa0, $fa1 ; LA64D-NEXT: fsub.s $fa1, $fa0, $fa1 ; LA64D-NEXT: ftintrz.l.s $fa1, $fa1 @@ -506,8 +501,7 @@ define float @convert_u32_to_float(i32 %a) nounwind { ; LA32D-NEXT: st.w $a0, $sp, 8 ; LA32D-NEXT: fld.d $fa0, $sp, 8 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI14_0) -; LA32D-NEXT: fld.d $fa1, $a0, 0 +; LA32D-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_0) ; LA32D-NEXT: fsub.d $fa0, $fa0, $fa1 ; LA32D-NEXT: fcvt.s.d $fa0, $fa0 ; LA32D-NEXT: addi.w $sp, $sp, 16 diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll index 26f44adc6135865..772ae8d81a88bf0 100644 --- a/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll +++ b/llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll @@ -11,38 +11,34 @@ define i32 @load_store_global() nounwind { ; LA32NOPIC-LABEL: load_store_global: ; LA32NOPIC: # %bb.0: -; 
LA32NOPIC-NEXT: pcalau12i $a0, %pc_hi20(G) -; LA32NOPIC-NEXT: addi.w $a1, $a0, %pc_lo12(G) -; LA32NOPIC-NEXT: ld.w $a0, $a1, 0 +; LA32NOPIC-NEXT: pcalau12i $a1, %pc_hi20(G) +; LA32NOPIC-NEXT: ld.w $a0, $a1, %pc_lo12(G) ; LA32NOPIC-NEXT: addi.w $a0, $a0, 1 -; LA32NOPIC-NEXT: st.w $a0, $a1, 0 +; LA32NOPIC-NEXT: st.w $a0, $a1, %pc_lo12(G) ; LA32NOPIC-NEXT: ret ; ; LA32PIC-LABEL: load_store_global: ; LA32PIC: # %bb.0: -; LA32PIC-NEXT: pcalau12i $a0, %pc_hi20(.LG$local) -; LA32PIC-NEXT: addi.w $a1, $a0, %pc_lo12(.LG$local) -; LA32PIC-NEXT: ld.w $a0, $a1, 0 +; LA32PIC-NEXT: pcalau12i $a1, %pc_hi20(.LG$local) +; LA32PIC-NEXT: ld.w $a0, $a1, %pc_lo12(.LG$local) ; LA32PIC-NEXT: addi.w $a0, $a0, 1 -; LA32PIC-NEXT: st.w $a0, $a1, 0 +; LA32PIC-NEXT: st.w $a0, $a1, %pc_lo12(.LG$local) ; LA32PIC-NEXT: ret ; ; LA64NOPIC-LABEL: load_store_global: ; LA64NOPIC: # %bb.0: -; LA64NOPIC-NEXT: pcalau12i $a0, %pc_hi20(G) -; LA64NOPIC-NEXT: addi.d $a1, $a0, %pc_lo12(G) -; LA64NOPIC-NEXT: ld.w $a0, $a1, 0 +; LA64NOPIC-NEXT: pcalau12i $a1, %pc_hi20(G) +; LA64NOPIC-NEXT: ld.w $a0, $a1, %pc_lo12(G) ; LA64NOPIC-NEXT: addi.w $a0, $a0, 1 -; LA64NOPIC-NEXT: st.w $a0, $a1, 0 +; LA64NOPIC-NEXT: st.w $a0, $a1, %pc_lo12(G) ; LA64NOPIC-NEXT: ret ; ; LA64PIC-LABEL: load_store_global: ; LA64PIC: # %bb.0: -; LA64PIC-NEXT: pcalau12i $a0, %pc_hi20(.LG$local) -; LA64PIC-NEXT: addi.d $a1, $a0, %pc_lo12(.LG$local) -; LA64PIC-NEXT: ld.w $a0, $a1, 0 +; LA64PIC-NEXT: pcalau12i $a1, %pc_hi20(.LG$local) +; LA64PIC-NEXT: ld.w $a0, $a1, %pc_lo12(.LG$local) ; LA64PIC-NEXT: addi.w $a0, $a0, 1 -; LA64PIC-NEXT: st.w $a0, $a1, 0 +; LA64PIC-NEXT: st.w $a0, $a1, %pc_lo12(.LG$local) ; LA64PIC-NEXT: ret %v = load i32, ptr @G %sum = add i32 %v, 1 diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll index 5248468b6027d52..ba72ef5bd7ba4be 100644 --- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll +++ 
b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll @@ -15,11 +15,10 @@ define void @test_la_pcrel(i32 signext %n) { ; LA32: # %bb.0: # %entry ; LA32-NEXT: move $a1, $zero ; LA32-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA32-NEXT: addi.w $a2, $a2, %pc_lo12(l) ; LA32-NEXT: .p2align 4, , 16 ; LA32-NEXT: .LBB0_1: # %loop ; LA32-NEXT: # =>This Inner Loop Header: Depth=1 -; LA32-NEXT: ld.w $zero, $a2, 0 +; LA32-NEXT: ld.w $zero, $a2, %pc_lo12(l) ; LA32-NEXT: addi.w $a1, $a1, 1 ; LA32-NEXT: blt $a1, $a0, .LBB0_1 ; LA32-NEXT: # %bb.2: # %ret @@ -29,11 +28,10 @@ define void @test_la_pcrel(i32 signext %n) { ; LA64: # %bb.0: # %entry ; LA64-NEXT: move $a1, $zero ; LA64-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA64-NEXT: addi.d $a2, $a2, %pc_lo12(l) ; LA64-NEXT: .p2align 4, , 16 ; LA64-NEXT: .LBB0_1: # %loop ; LA64-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64-NEXT: ld.w $zero, $a2, 0 +; LA64-NEXT: ld.w $zero, $a2, %pc_lo12(l) ; LA64-NEXT: addi.w $a1, $a1, 1 ; LA64-NEXT: blt $a1, $a0, .LBB0_1 ; LA64-NEXT: # %bb.2: # %ret @@ -41,18 +39,17 @@ define void @test_la_pcrel(i32 signext %n) { ; ; LA64LARGE-LABEL: test_la_pcrel: ; LA64LARGE: # %bb.0: # %entry -; LA64LARGE-NEXT: pcalau12i $a2, %pc_hi20(l) -; LA64LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(l) -; LA64LARGE-NEXT: lu32i.d $a1, %pc64_lo20(l) -; LA64LARGE-NEXT: lu52i.d $a3, $a1, %pc64_hi12(l) -; LA64LARGE-NEXT: move $a1, $zero -; LA64LARGE-NEXT: add.d $a2, $a3, $a2 +; LA64LARGE-NEXT: pcalau12i $a1, %pc_hi20(l) +; LA64LARGE-NEXT: addi.d $a2, $zero, %pc_lo12(l) +; LA64LARGE-NEXT: lu32i.d $a2, %pc64_lo20(l) +; LA64LARGE-NEXT: lu52i.d $a2, $a2, %pc64_hi12(l) +; LA64LARGE-NEXT: move $a3, $zero ; LA64LARGE-NEXT: .p2align 4, , 16 ; LA64LARGE-NEXT: .LBB0_1: # %loop ; LA64LARGE-NEXT: # =>This Inner Loop Header: Depth=1 -; LA64LARGE-NEXT: ld.w $zero, $a2, 0 -; LA64LARGE-NEXT: addi.w $a1, $a1, 1 -; LA64LARGE-NEXT: blt $a1, $a0, .LBB0_1 +; LA64LARGE-NEXT: ldx.w $zero, $a2, $a1 +; LA64LARGE-NEXT: addi.w $a3, $a3, 1 +; LA64LARGE-NEXT: blt 
$a3, $a0, .LBB0_1 ; LA64LARGE-NEXT: # %bb.2: # %ret ; LA64LARGE-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll index 48d18dbedcaf265..32a4c4bdd1508a9 100644 --- a/llvm/test/CodeGen/LoongArch/merge-base-offset.ll +++ b/llvm/test/CodeGen/LoongArch/merge-base-offset.ll @@ -12,15 +12,13 @@ define dso_local signext i8 @load_s8() nounwind { ; LA32-LABEL: load_s8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) -; LA32-NEXT: ld.b $a0, $a0, 0 +; LA32-NEXT: ld.b $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) -; LA64-NEXT: ld.b $a0, $a0, 0 +; LA64-NEXT: ld.b $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s8: @@ -29,8 +27,7 @@ define dso_local signext i8 @load_s8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.b $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.b $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i8, ptr @g_i8 @@ -41,15 +38,13 @@ define dso_local zeroext i8 @load_u8() nounwind { ; LA32-LABEL: load_u8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) -; LA32-NEXT: ld.bu $a0, $a0, 0 +; LA32-NEXT: ld.bu $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) -; LA64-NEXT: ld.bu $a0, $a0, 0 +; LA64-NEXT: ld.bu $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_u8: @@ -58,8 +53,7 @@ define dso_local zeroext i8 @load_u8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; 
LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.bu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.bu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i8, ptr @g_i8 @@ -70,17 +64,15 @@ define dso_local void @store_i8() nounwind { ; LA32-LABEL: store_i8: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.b $a1, $a0, 0 +; LA32-NEXT: st.b $a1, $a0, %pc_lo12(g_i8) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i8: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i8) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.b $a1, $a0, 0 +; LA64-NEXT: st.b $a1, $a0, %pc_lo12(g_i8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i8: @@ -89,9 +81,8 @@ define dso_local void @store_i8() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i8) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i8) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i8) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.b $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.b $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i8 1, ptr @g_i8 @@ -104,15 +95,13 @@ define dso_local signext i16 @load_s16() nounwind { ; LA32-LABEL: load_s16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) -; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: ld.h $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) -; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: ld.h $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s16: @@ -121,8 +110,7 @@ define dso_local signext i16 @load_s16() 
nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.h $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.h $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i16, ptr @g_i16 @@ -133,15 +121,13 @@ define dso_local zeroext i16 @load_u16() nounwind { ; LA32-LABEL: load_u16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) -; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: ld.hu $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) -; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: ld.hu $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_u16: @@ -150,8 +136,7 @@ define dso_local zeroext i16 @load_u16() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.hu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.hu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i16, ptr @g_i16 @@ -162,17 +147,15 @@ define dso_local void @store_i16() nounwind { ; LA32-LABEL: store_i16: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.h $a1, $a0, 0 +; LA32-NEXT: st.h $a1, $a0, %pc_lo12(g_i16) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i16: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i16) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.h $a1, $a0, 0 +; LA64-NEXT: st.h $a1, $a0, %pc_lo12(g_i16) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i16: @@ -181,9 +164,8 @@ define 
dso_local void @store_i16() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i16) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i16) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i16) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.h $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.h $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i16 1, ptr @g_i16 @@ -196,15 +178,13 @@ define dso_local signext i32 @load_s32() nounwind { ; LA32-LABEL: load_s32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_s32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) -; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_s32: @@ -213,8 +193,7 @@ define dso_local signext i32 @load_s32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.w $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.w $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i32, ptr @g_i32 @@ -225,15 +204,13 @@ define dso_local zeroext i32 @load_u32() nounwind { ; LA32-LABEL: load_u32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_u32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) -; LA64-NEXT: ld.wu $a0, $a0, 0 +; LA64-NEXT: ld.wu $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: 
load_u32: @@ -242,8 +219,7 @@ define dso_local zeroext i32 @load_u32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.wu $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.wu $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i32, ptr @g_i32 @@ -254,17 +230,15 @@ define dso_local void @store_i32() nounwind { ; LA32-LABEL: store_i32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_i32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_i32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_i32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i32: @@ -273,9 +247,8 @@ define dso_local void @store_i32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.w $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i32 1, ptr @g_i32 @@ -296,8 +269,7 @@ define dso_local i64 @load_64() nounwind { ; LA64-LABEL: load_64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64) -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_64: @@ -306,8 +278,7 @@ define dso_local i64 @load_64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64) ; LA64-LARGE-NEXT: lu32i.d $a1, 
%pc64_lo20(g_i64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i64, ptr @g_i64 @@ -327,9 +298,8 @@ define dso_local void @store_i64() nounwind { ; LA64-LABEL: store_i64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_i64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_i64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_i64: @@ -338,9 +308,8 @@ define dso_local void @store_i64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_i64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_i64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_i64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i64 1, ptr @g_i64 @@ -353,15 +322,13 @@ define dso_local float @load_f32() nounwind { ; LA32-LABEL: load_f32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32) -; LA32-NEXT: fld.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: load_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32) -; LA64-NEXT: fld.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(g_f32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_f32: @@ -370,8 +337,7 @@ define dso_local float @load_f32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.s $fa0, 
$a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load float, ptr @g_f32 @@ -382,17 +348,15 @@ define dso_local void @store_f32() nounwind { ; LA32-LABEL: store_f32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f32) ; LA32-NEXT: lu12i.w $a1, 260096 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_f32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_f32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f32) ; LA64-NEXT: lu12i.w $a1, 260096 -; LA64-NEXT: st.w $a1, $a0, 0 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_f32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_f32: @@ -401,9 +365,8 @@ define dso_local void @store_f32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 260096 -; LA64-LARGE-NEXT: st.w $a1, $a0, 0 +; LA64-LARGE-NEXT: lu12i.w $a2, 260096 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store float 1.0, ptr @g_f32 @@ -416,15 +379,13 @@ define dso_local double @load_f64() nounwind { ; LA32-LABEL: load_f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64) -; LA32-NEXT: fld.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: load_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64) -; LA64-NEXT: fld.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(g_f64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_f64: @@ -433,8 +394,7 @@ define dso_local double @load_f64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64) -; 
LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.d $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load double, ptr @g_f64 @@ -445,20 +405,18 @@ define dso_local void @store_f64() nounwind { ; LA32-LABEL: store_f64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: addi.w $a1, $zero, 1 ; LA32-NEXT: movgr2fr.w $fa0, $a1 ; LA32-NEXT: ffint.s.w $fa0, $fa0 ; LA32-NEXT: fcvt.d.s $fa0, $fa0 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fst.d $fa0, $a0, %pc_lo12(g_f64) ; LA32-NEXT: ret ; ; LA64-LABEL: store_f64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_f64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_f64) ; LA64-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_f64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_f64: @@ -467,9 +425,8 @@ define dso_local void @store_f64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_f64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_f64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_f64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu52i.d $a1, $zero, 1023 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: lu52i.d $a2, $zero, 1023 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store double 1.0, ptr @g_f64 @@ -494,11 +451,10 @@ define dso_local void @store_multi() nounwind { ; LA64-LABEL: store_multi: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_m64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ori $a1, $zero, 2 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_m64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_multi: @@ -507,11 +463,10 @@ define dso_local void @store_multi() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, 
$zero, %pc_lo12(g_m64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_m64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_m64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 -; LA64-LARGE-NEXT: ori $a1, $zero, 2 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 +; LA64-LARGE-NEXT: ori $a2, $zero, 2 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store volatile i64 1, ptr @g_m64 @@ -525,17 +480,15 @@ define dso_local void @store_sf32() nounwind { ; LA32-LABEL: store_sf32: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf32) -; LA32-NEXT: fld.s $fa0, $a0, 0 -; LA32-NEXT: fst.s $fa0, $a0, 0 +; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(g_sf32) +; LA32-NEXT: fst.s $fa0, $a0, %pc_lo12(g_sf32) ; LA32-NEXT: ret ; ; LA64-LABEL: store_sf32: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf32) -; LA64-NEXT: fld.s $fa0, $a0, 0 -; LA64-NEXT: fst.s $fa0, $a0, 0 +; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(g_sf32) +; LA64-NEXT: fst.s $fa0, $a0, %pc_lo12(g_sf32) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_sf32: @@ -544,9 +497,8 @@ define dso_local void @store_sf32() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf32) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf32) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.s $fa0, $a0, 0 -; LA64-LARGE-NEXT: fst.s $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.s $fa0, $a1, $a0 +; LA64-LARGE-NEXT: fstx.s $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load float, ptr @g_sf32 @@ -560,17 +512,15 @@ define dso_local void @store_sf64() nounwind { ; LA32-LABEL: store_sf64: ; LA32: # %bb.0: # %entry ; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_sf64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_sf64) -; 
LA32-NEXT: fld.d $fa0, $a0, 0 -; LA32-NEXT: fst.d $fa0, $a0, 0 +; LA32-NEXT: fld.d $fa0, $a0, %pc_lo12(g_sf64) +; LA32-NEXT: fst.d $fa0, $a0, %pc_lo12(g_sf64) ; LA32-NEXT: ret ; ; LA64-LABEL: store_sf64: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_sf64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_sf64) -; LA64-NEXT: fld.d $fa0, $a0, 0 -; LA64-NEXT: fst.d $fa0, $a0, 0 +; LA64-NEXT: fld.d $fa0, $a0, %pc_lo12(g_sf64) +; LA64-NEXT: fst.d $fa0, $a0, %pc_lo12(g_sf64) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_sf64: @@ -579,9 +529,8 @@ define dso_local void @store_sf64() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_sf64) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_sf64) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_sf64) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: fld.d $fa0, $a0, 0 -; LA64-LARGE-NEXT: fst.d $fa0, $a0, 0 +; LA64-LARGE-NEXT: fldx.d $fa0, $a1, $a0 +; LA64-LARGE-NEXT: fstx.d $fa0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load double, ptr @g_sf64 @@ -608,10 +557,9 @@ define dso_local void @rmw() nounwind { ; LA64-LABEL: rmw: ; LA64: # %bb.0: # %entry ; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_rmw) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_rmw) -; LA64-NEXT: ld.d $a1, $a0, 0 +; LA64-NEXT: ld.d $a1, $a0, %pc_lo12(g_rmw) ; LA64-NEXT: addi.d $a1, $a1, 1 -; LA64-NEXT: st.d $a1, $a0, 0 +; LA64-NEXT: st.d $a1, $a0, %pc_lo12(g_rmw) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: rmw: @@ -620,10 +568,9 @@ define dso_local void @rmw() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_rmw) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_rmw) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_rmw) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a1, $a0, 0 -; LA64-LARGE-NEXT: addi.d $a1, $a1, 1 -; LA64-LARGE-NEXT: st.d $a1, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a2, $a1, $a0 +; LA64-LARGE-NEXT: addi.d $a2, $a2, 1 +; LA64-LARGE-NEXT: stx.d $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: %0 = load i64, ptr @g_rmw 
@@ -637,31 +584,26 @@ entry: define dso_local void @store_a32() nounwind { ; LA32-LABEL: store_a32: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32) -; LA32-NEXT: lu12i.w $a1, 1 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) ; LA32-NEXT: ori $a1, $zero, 1 -; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4096) ; LA32-NEXT: ret ; ; LA64-LABEL: store_a32: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32) +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) ; LA64-NEXT: ori $a1, $zero, 1 -; LA64-NEXT: stptr.w $a1, $a0, 4096 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4096) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: store_a32: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 1 -; LA64-LARGE-NEXT: stptr.w $a1, $a0, 4096 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4096) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4096) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4096) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4096) +; LA64-LARGE-NEXT: ori $a2, $zero, 1 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: store i32 1, ptr getelementptr inbounds ([1 x i32], ptr @g_a32, i32 1024), align 4 @@ -714,48 +656,44 @@ entry: define dso_local void @control_flow_with_mem_access() nounwind { ; LA32-LABEL: control_flow_with_mem_access: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a32) -; LA32-NEXT: ld.w $a1, $a0, 4 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA32-NEXT: ld.w $a1, $a0, %pc_lo12(g_a32+4) ; 
LA32-NEXT: ori $a2, $zero, 1 ; LA32-NEXT: blt $a1, $a2, .LBB21_2 ; LA32-NEXT: # %bb.1: # %if.then ; LA32-NEXT: ori $a1, $zero, 10 -; LA32-NEXT: st.w $a1, $a0, 4 +; LA32-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4) ; LA32-NEXT: .LBB21_2: # %if.end ; LA32-NEXT: ret ; ; LA64-LABEL: control_flow_with_mem_access: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a32) -; LA64-NEXT: ld.w $a1, $a0, 4 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-NEXT: ld.w $a1, $a0, %pc_lo12(g_a32+4) ; LA64-NEXT: ori $a2, $zero, 1 ; LA64-NEXT: blt $a1, $a2, .LBB21_2 ; LA64-NEXT: # %bb.1: # %if.then ; LA64-NEXT: ori $a1, $zero, 10 -; LA64-NEXT: st.w $a1, $a0, 4 +; LA64-NEXT: st.w $a1, $a0, %pc_lo12(g_a32+4) ; LA64-NEXT: .LBB21_2: # %if.end ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: control_flow_with_mem_access: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.w $a0, $a0, 4 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4) +; LA64-LARGE-NEXT: ldx.w $a0, $a1, $a0 ; LA64-LARGE-NEXT: ori $a1, $zero, 1 ; LA64-LARGE-NEXT: blt $a0, $a1, .LBB21_2 ; LA64-LARGE-NEXT: # %bb.1: # %if.then -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ori $a1, $zero, 10 -; LA64-LARGE-NEXT: st.w $a1, $a0, 4 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a32+4) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a32+4) 
+; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a32+4) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a32+4) +; LA64-LARGE-NEXT: ori $a2, $zero, 10 +; LA64-LARGE-NEXT: stx.w $a2, $a1, $a0 ; LA64-LARGE-NEXT: .LBB21_2: # %if.end ; LA64-LARGE-NEXT: ret entry: @@ -777,8 +715,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA32-NEXT: .Ltmp0: # Block address taken ; LA32-NEXT: # %bb.1: # %label ; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp0) -; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(.Ltmp0) ; LA32-NEXT: ret ; ; LA64-LABEL: load_ba_1: @@ -786,8 +723,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA64-NEXT: .Ltmp0: # Block address taken ; LA64-NEXT: # %bb.1: # %label ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp0) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp0) -; LA64-NEXT: ld.d $a0, $a0, 0 +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(.Ltmp0) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_ba_1: @@ -798,8 +734,7 @@ define dso_local ptr @load_ba_1() nounwind { ; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp0) ; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp0) ; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp0) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 0 +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: br label %label @@ -813,30 +748,27 @@ define dso_local ptr @load_ba_2() nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: .Ltmp1: # Block address taken ; LA32-NEXT: # %bb.1: # %label -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(.Ltmp1) -; LA32-NEXT: ld.w $a0, $a0, 8 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA32-NEXT: ld.w $a0, $a0, %pc_lo12(.Ltmp1+8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_ba_2: ; LA64: # %bb.0: # %entry ; LA64-NEXT: .Ltmp1: # Block address taken ; LA64-NEXT: # %bb.1: # %label -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.Ltmp1) -; 
LA64-NEXT: ld.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA64-NEXT: ld.d $a0, $a0, %pc_lo12(.Ltmp1+8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_ba_2: ; LA64-LARGE: # %bb.0: # %entry ; LA64-LARGE-NEXT: .Ltmp1: # Block address taken ; LA64-LARGE-NEXT: # %bb.1: # %label -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp1) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp1) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1) -; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: ld.d $a0, $a0, 8 +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(.Ltmp1+8) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(.Ltmp1+8) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(.Ltmp1+8) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Ltmp1+8) +; LA64-LARGE-NEXT: ldx.d $a0, $a1, $a0 ; LA64-LARGE-NEXT: ret entry: br label %label @@ -850,26 +782,23 @@ label: define dso_local ptr @load_addr_offset_1() nounwind { ; LA32-LABEL: load_addr_offset_1: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 8 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8) +; LA64-LARGE-NEXT: 
lu32i.d $a1, %pc64_lo20(g_a64+8) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 8 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1) @@ -878,29 +807,23 @@ entry: define dso_local ptr @load_addr_offset_257() nounwind { ; LA32-LABEL: load_addr_offset_257: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 2047 -; LA32-NEXT: addi.w $a0, $a0, 9 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2056) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_257: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addi.d $a0, $a0, 2047 -; LA64-NEXT: addi.d $a0, $a0, 9 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2056) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_257: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2056) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2056) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2056) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2056) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 2047 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 9 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 257) @@ -909,27 +832,23 @@ entry: define dso_local ptr @load_addr_offset_1048576() nounwind { ; LA32-LABEL: load_addr_offset_1048576: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, 
%pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8388608) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1048576: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8388608) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1048576: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388608) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8388608) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+8388608) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8388608) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048576) @@ -938,30 +857,23 @@ entry: define dso_local ptr @load_addr_offset_1048577() nounwind { ; LA32-LABEL: load_addr_offset_1048577: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 2048 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388616) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+8388616) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_1048577: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: addu16i.d $a0, $a0, 128 -; LA64-NEXT: addi.d $a0, $a0, 8 +; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(g_a64+8388616) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+8388616) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_1048577: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+8388616) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+8388616) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+8388616) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+8388616) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: addu16i.d $a0, $a0, 128 -; LA64-LARGE-NEXT: addi.d $a0, $a0, 8 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 1048577) @@ -970,29 +882,23 @@ entry: define dso_local ptr @load_addr_offset_268432896() nounwind { ; LA32-LABEL: load_addr_offset_268432896: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463168) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2147463168) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_268432896: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463168) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2147463168) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_268432896: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, 
%pc_hi20(g_a64+2147463168) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2147463168) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2147463168) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2147463168) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 524283 -; LA64-LARGE-NEXT: add.d $a0, $a0, $a1 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432896) @@ -1001,32 +907,23 @@ entry: define dso_local ptr @load_addr_offset_268432897() nounwind { ; LA32-LABEL: load_addr_offset_268432897: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 524283 -; LA32-NEXT: ori $a1, $a1, 8 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+2147463176) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_268432897: ; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64) -; LA64-NEXT: lu12i.w $a1, 524283 -; LA64-NEXT: ori $a1, $a1, 8 -; LA64-NEXT: add.d $a0, $a0, $a1 +; LA64-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(g_a64+2147463176) ; LA64-NEXT: ret ; ; LA64-LARGE-LABEL: load_addr_offset_268432897: ; LA64-LARGE: # %bb.0: # %entry -; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64) -; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64) -; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64) +; LA64-LARGE-NEXT: pcalau12i $a0, %pc_hi20(g_a64+2147463176) +; LA64-LARGE-NEXT: addi.d $a1, $zero, %pc_lo12(g_a64+2147463176) +; LA64-LARGE-NEXT: lu32i.d $a1, %pc64_lo20(g_a64+2147463176) +; LA64-LARGE-NEXT: lu52i.d $a1, $a1, %pc64_hi12(g_a64+2147463176) ; LA64-LARGE-NEXT: add.d $a0, $a1, $a0 -; LA64-LARGE-NEXT: lu12i.w $a1, 524283 -; LA64-LARGE-NEXT: ori $a1, $a1, 8 -; LA64-LARGE-NEXT: 
add.d $a0, $a0, $a1 ; LA64-LARGE-NEXT: ret entry: ret ptr getelementptr inbounds ([1 x i64], ptr @g_a64, i64 268432897) @@ -1035,11 +932,8 @@ entry: define dso_local ptr @load_addr_offset_9380351707272() nounwind { ; LA32-LABEL: load_addr_offset_9380351707272: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 279556 -; LA32-NEXT: ori $a1, $a1, 1088 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+1145062464) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+1145062464) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_9380351707272: @@ -1071,11 +965,8 @@ entry: define dso_local ptr @load_addr_offset_614750729487779976() nounwind { ; LA32-LABEL: load_addr_offset_614750729487779976: ; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64) -; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64) -; LA32-NEXT: lu12i.w $a1, 279556 -; LA32-NEXT: ori $a1, $a1, 1088 -; LA32-NEXT: add.w $a0, $a0, $a1 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(g_a64+1145062464) +; LA32-NEXT: addi.w $a0, $a0, %pc_lo12(g_a64+1145062464) ; LA32-NEXT: ret ; ; LA64-LABEL: load_addr_offset_614750729487779976: diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll index 391888a38daf696..e07a334888c189f 100644 --- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll +++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll @@ -105,6 +105,7 @@ ; LAXX-NEXT: Remove dead machine instructions ; LA64-NEXT: LoongArch Optimize W Instructions ; LAXX-NEXT: LoongArch Pre-RA pseudo instruction expansion pass +; LAXX-NEXT: LoongArch Merge Base Offset ; LAXX-NEXT: Detect Dead Lanes ; LAXX-NEXT: Init Undef Pass ; LAXX-NEXT: Process Implicit Definitions diff --git a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll index b03a523fb79a9bf..a7873f466bee3f0 100644 --- 
a/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll +++ b/llvm/test/CodeGen/LoongArch/psabi-restricted-scheduling.ll @@ -25,8 +25,7 @@ define void @foo() nounwind { ; MEDIUM_NO_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, 0 ; MEDIUM_NO_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; MEDIUM_NO_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUM_NO_SCH-NEXT: ld.d $zero, $a0, %pc_lo12(.Lg$local) ; MEDIUM_NO_SCH-NEXT: ori $a0, $zero, 1 ; MEDIUM_NO_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_NO_SCH-NEXT: jirl $ra, $ra, 0 @@ -55,8 +54,7 @@ define void @foo() nounwind { ; MEDIUM_SCH-NEXT: ld.d $a0, $a0, %got_pc_lo12(G) ; MEDIUM_SCH-NEXT: ld.d $zero, $a0, 0 ; MEDIUM_SCH-NEXT: pcalau12i $a0, %pc_hi20(.Lg$local) -; MEDIUM_SCH-NEXT: addi.d $a0, $a0, %pc_lo12(.Lg$local) -; MEDIUM_SCH-NEXT: ld.d $zero, $a0, 0 +; MEDIUM_SCH-NEXT: ld.d $zero, $a0, %pc_lo12(.Lg$local) ; MEDIUM_SCH-NEXT: ori $a0, $zero, 1 ; MEDIUM_SCH-NEXT: pcaddu18i $ra, %call36(bar) ; MEDIUM_SCH-NEXT: jirl $ra, $ra, 0 @@ -91,8 +89,7 @@ define void @foo() nounwind { ; LARGE_NO_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LARGE_NO_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LARGE_NO_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LARGE_NO_SCH-NEXT: add.d $a0, $a1, $a0 -; LARGE_NO_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGE_NO_SCH-NEXT: ldx.d $zero, $a1, $a0 ; LARGE_NO_SCH-NEXT: ori $a0, $zero, 1 ; LARGE_NO_SCH-NEXT: pcalau12i $a1, %got_pc_hi20(bar) ; LARGE_NO_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) @@ -148,8 +145,7 @@ define void @foo() nounwind { ; LARGE_SCH-NEXT: addi.d $a1, $zero, %pc_lo12(.Lg$local) ; LARGE_SCH-NEXT: lu32i.d $a1, %pc64_lo20(.Lg$local) ; LARGE_SCH-NEXT: lu52i.d $a1, $a1, %pc64_hi12(.Lg$local) -; LARGE_SCH-NEXT: add.d $a0, $a1, $a0 -; LARGE_SCH-NEXT: ld.d $zero, $a0, 0 +; LARGE_SCH-NEXT: ldx.d $zero, $a1, $a0 ; LARGE_SCH-NEXT: ori $a0, $zero, 1 ; LARGE_SCH-NEXT: pcalau12i $a1, 
%got_pc_hi20(bar) ; LARGE_SCH-NEXT: addi.d $ra, $zero, %got_pc_lo12(bar) diff --git a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll index 0a401ebe5f6b2ae..8dd1ec465c13add 100644 --- a/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll +++ b/llvm/test/CodeGen/LoongArch/vector-fp-imm.ll @@ -94,8 +94,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: fld.s $fa1, $a0, 0 ; LA32F-NEXT: addi.w $a0, $zero, 1 ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI1_0) -; LA32F-NEXT: fld.s $fa2, $a2, 0 +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA32F-NEXT: movgr2fr.w $fa3, $a0 ; LA32F-NEXT: ffint.s.w $fa3, $fa3 ; LA32F-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -110,8 +109,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: fld.s $fa1, $a0, 0 ; LA32D-NEXT: addi.w $a0, $zero, 1 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI1_0) -; LA32D-NEXT: fld.s $fa2, $a2, 0 +; LA32D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA32D-NEXT: movgr2fr.w $fa3, $a0 ; LA32D-NEXT: ffint.s.w $fa3, $fa3 ; LA32D-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -126,8 +124,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: fld.s $fa1, $a0, 0 ; LA64F-NEXT: addi.w $a0, $zero, 1 ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI1_0) -; LA64F-NEXT: fld.s $fa2, $a2, 0 +; LA64F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA64F-NEXT: movgr2fr.w $fa3, $a0 ; LA64F-NEXT: ffint.s.w $fa3, $fa3 ; LA64F-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -142,8 +139,7 @@ define void @test_f2(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: fld.s $fa1, $a0, 0 ; LA64D-NEXT: addi.w $a0, $zero, 1 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI1_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI1_0) -; LA64D-NEXT: fld.s $fa2, $a2, 0 +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI1_0) ; LA64D-NEXT: movgr2fr.w $fa3, $a0 ; LA64D-NEXT: ffint.s.w $fa3, $fa3 ; 
LA64D-NEXT: fadd.s $fa1, $fa1, $fa3 @@ -168,14 +164,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: movgr2fr.w $fa4, $a0 ; LA32F-NEXT: ffint.s.w $fa4, $fa4 ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32F-NEXT: fld.s $fa5, $a0, 0 +; LA32F-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32F-NEXT: fld.s $fa6, $a0, 0 +; LA32F-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA32F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA32F-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_2) -; LA32F-NEXT: fld.s $fa7, $a0, 0 +; LA32F-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA32F-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA32F-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA32F-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -196,14 +189,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: movgr2fr.w $fa4, $a0 ; LA32D-NEXT: ffint.s.w $fa4, $fa4 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_0) -; LA32D-NEXT: fld.s $fa5, $a0, 0 +; LA32D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_1) -; LA32D-NEXT: fld.s $fa6, $a0, 0 +; LA32D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI2_2) -; LA32D-NEXT: fld.s $fa7, $a0, 0 +; LA32D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA32D-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA32D-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA32D-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -224,14 +214,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: movgr2fr.w $fa4, $a0 ; LA64F-NEXT: ffint.s.w $fa4, $fa4 ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64F-NEXT: fld.s $fa5, $a0, 0 +; LA64F-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA64F-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI2_1) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64F-NEXT: fld.s $fa6, $a0, 0 +; LA64F-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA64F-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA64F-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_2) -; LA64F-NEXT: fld.s $fa7, $a0, 0 +; LA64F-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA64F-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA64F-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA64F-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -252,14 +239,11 @@ define void @test_f4(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: movgr2fr.w $fa4, $a0 ; LA64D-NEXT: ffint.s.w $fa4, $fa4 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_0) -; LA64D-NEXT: fld.s $fa5, $a0, 0 +; LA64D-NEXT: fld.s $fa5, $a0, %pc_lo12(.LCPI2_0) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_1) -; LA64D-NEXT: fld.s $fa6, $a0, 0 +; LA64D-NEXT: fld.s $fa6, $a0, %pc_lo12(.LCPI2_1) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_2) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI2_2) -; LA64D-NEXT: fld.s $fa7, $a0, 0 +; LA64D-NEXT: fld.s $fa7, $a0, %pc_lo12(.LCPI2_2) ; LA64D-NEXT: fadd.s $fa3, $fa3, $fa4 ; LA64D-NEXT: fadd.s $fa2, $fa2, $fa5 ; LA64D-NEXT: fadd.s $fa1, $fa1, $fa6 @@ -281,14 +265,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA32F-NEXT: addi.w $a2, $zero, 1 ; LA32F-NEXT: movgr2fr.w $fa0, $a2 ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_0) -; LA32F-NEXT: fld.s $fa1, $a2, 0 +; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_1) -; LA32F-NEXT: fld.s $fa2, $a2, 0 +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA32F-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_2) -; LA32F-NEXT: fld.s $fa3, $a2, 0 +; LA32F-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA32F-NEXT: fld.s $fa4, $a0, 28 ; 
LA32F-NEXT: fld.s $fa5, $a0, 24 ; LA32F-NEXT: fld.s $fa6, $a0, 12 @@ -321,14 +302,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a2, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa0, $a2 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_0) -; LA32D-NEXT: fld.s $fa1, $a2, 0 +; LA32D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_1) -; LA32D-NEXT: fld.s $fa2, $a2, 0 +; LA32D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI3_2) -; LA32D-NEXT: fld.s $fa3, $a2, 0 +; LA32D-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA32D-NEXT: fld.s $fa4, $a0, 28 ; LA32D-NEXT: fld.s $fa5, $a0, 24 ; LA32D-NEXT: fld.s $fa6, $a0, 12 @@ -361,14 +339,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA64F-NEXT: addi.w $a2, $zero, 1 ; LA64F-NEXT: movgr2fr.w $fa0, $a2 ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_0) -; LA64F-NEXT: fld.s $fa1, $a2, 0 +; LA64F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_1) -; LA64F-NEXT: fld.s $fa2, $a2, 0 +; LA64F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA64F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64F-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_2) -; LA64F-NEXT: fld.s $fa3, $a2, 0 +; LA64F-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA64F-NEXT: fld.s $fa4, $a0, 28 ; LA64F-NEXT: fld.s $fa5, $a0, 24 ; LA64F-NEXT: fld.s $fa6, $a0, 12 @@ -401,14 +376,11 @@ define void @test_f8(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: addi.w $a2, $zero, 1 ; LA64D-NEXT: movgr2fr.w $fa0, $a2 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_0) -; LA64D-NEXT: fld.s $fa1, $a2, 0 +; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI3_0) ; LA64D-NEXT: 
pcalau12i $a2, %pc_hi20(.LCPI3_1) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_1) -; LA64D-NEXT: fld.s $fa2, $a2, 0 +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI3_1) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI3_2) -; LA64D-NEXT: fld.s $fa3, $a2, 0 +; LA64D-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI3_2) ; LA64D-NEXT: fld.s $fa4, $a0, 28 ; LA64D-NEXT: fld.s $fa5, $a0, 24 ; LA64D-NEXT: fld.s $fa6, $a0, 12 @@ -488,8 +460,7 @@ define void @test_d2(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a0, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa2, $a0 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI4_0) -; LA32D-NEXT: fld.d $fa3, $a0, 0 +; LA32D-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI4_0) ; LA32D-NEXT: ffint.s.w $fa2, $fa2 ; LA32D-NEXT: fcvt.d.s $fa2, $fa2 ; LA32D-NEXT: fadd.d $fa1, $fa1, $fa2 @@ -529,8 +500,7 @@ define void @test_d2(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: fld.d $fa1, $a0, 0 ; LA64D-NEXT: addi.d $a0, $zero, 1 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI4_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI4_0) -; LA64D-NEXT: fld.d $fa2, $a2, 0 +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI4_0) ; LA64D-NEXT: movgr2fr.d $fa3, $a0 ; LA64D-NEXT: ffint.d.l $fa3, $fa3 ; LA64D-NEXT: fadd.d $fa1, $fa1, $fa3 @@ -625,14 +595,11 @@ define void @test_d4(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: ffint.s.w $fa4, $fa4 ; LA32D-NEXT: fcvt.d.s $fa4, $fa4 ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_0) -; LA32D-NEXT: fld.d $fa5, $a0, 0 +; LA32D-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_0) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_1) -; LA32D-NEXT: fld.d $fa6, $a0, 0 +; LA32D-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_1) ; LA32D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) -; LA32D-NEXT: addi.w $a0, $a0, %pc_lo12(.LCPI5_2) -; LA32D-NEXT: fld.d $fa7, $a0, 0 +; LA32D-NEXT: fld.d $fa7, $a0, 
%pc_lo12(.LCPI5_2) ; LA32D-NEXT: fadd.d $fa3, $fa3, $fa4 ; LA32D-NEXT: fadd.d $fa2, $fa2, $fa5 ; LA32D-NEXT: fadd.d $fa1, $fa1, $fa6 @@ -696,14 +663,11 @@ define void @test_d4(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: movgr2fr.d $fa4, $a0 ; LA64D-NEXT: ffint.d.l $fa4, $fa4 ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_0) -; LA64D-NEXT: fld.d $fa5, $a0, 0 +; LA64D-NEXT: fld.d $fa5, $a0, %pc_lo12(.LCPI5_0) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_1) -; LA64D-NEXT: fld.d $fa6, $a0, 0 +; LA64D-NEXT: fld.d $fa6, $a0, %pc_lo12(.LCPI5_1) ; LA64D-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_2) -; LA64D-NEXT: addi.d $a0, $a0, %pc_lo12(.LCPI5_2) -; LA64D-NEXT: fld.d $fa7, $a0, 0 +; LA64D-NEXT: fld.d $fa7, $a0, %pc_lo12(.LCPI5_2) ; LA64D-NEXT: fadd.d $fa3, $fa3, $fa4 ; LA64D-NEXT: fadd.d $fa2, $fa2, $fa5 ; LA64D-NEXT: fadd.d $fa1, $fa1, $fa6 @@ -852,14 +816,11 @@ define void @test_d8(ptr %P, ptr %S) nounwind { ; LA32D-NEXT: addi.w $a2, $zero, 1 ; LA32D-NEXT: movgr2fr.w $fa0, $a2 ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_0) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_0) -; LA32D-NEXT: fld.d $fa1, $a2, 0 +; LA32D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI6_0) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_1) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_1) -; LA32D-NEXT: fld.d $fa2, $a2, 0 +; LA32D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI6_1) ; LA32D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_2) -; LA32D-NEXT: addi.w $a2, $a2, %pc_lo12(.LCPI6_2) -; LA32D-NEXT: fld.d $fa3, $a2, 0 +; LA32D-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI6_2) ; LA32D-NEXT: fld.d $fa4, $a0, 56 ; LA32D-NEXT: fld.d $fa5, $a0, 48 ; LA32D-NEXT: fld.d $fa6, $a0, 24 @@ -976,14 +937,11 @@ define void @test_d8(ptr %P, ptr %S) nounwind { ; LA64D-NEXT: addi.d $a2, $zero, 1 ; LA64D-NEXT: movgr2fr.d $fa0, $a2 ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_0) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_0) -; LA64D-NEXT: fld.d $fa1, 
$a2, 0 +; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI6_0) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_1) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_1) -; LA64D-NEXT: fld.d $fa2, $a2, 0 +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI6_1) ; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI6_2) -; LA64D-NEXT: addi.d $a2, $a2, %pc_lo12(.LCPI6_2) -; LA64D-NEXT: fld.d $fa3, $a2, 0 +; LA64D-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI6_2) ; LA64D-NEXT: fld.d $fa4, $a0, 56 ; LA64D-NEXT: fld.d $fa5, $a0, 48 ; LA64D-NEXT: fld.d $fa6, $a0, 24 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected index 56b6c90a2f6f33f..7a7115b393b1db2 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected @@ -122,7 +122,6 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_def_cfa 22, 0 ; CHECK-NEXT: st.w $zero, $fp, -12 ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(x) -; CHECK-NEXT: addi.w $a0, $a0, %pc_lo12(x) ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: st.w $a1, $fp, -16 ; CHECK-NEXT: ori $a2, $zero, 2 @@ -131,7 +130,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: st.w $a3, $fp, -24 ; CHECK-NEXT: ori $a4, $zero, 4 ; CHECK-NEXT: st.w $a4, $fp, -28 -; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: st.w $a1, $a0, %pc_lo12(x) ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: st.w $a1, $fp, -16 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected index 
2e063202fcf79e3..d99eb3749826f01 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected @@ -99,7 +99,6 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: .cfi_def_cfa 22, 0 ; CHECK-NEXT: st.w $zero, $fp, -12 ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(x) -; CHECK-NEXT: addi.w $a0, $a0, %pc_lo12(x) ; CHECK-NEXT: ori $a1, $zero, 1 ; CHECK-NEXT: st.w $a1, $fp, -16 ; CHECK-NEXT: ori $a2, $zero, 2 @@ -108,7 +107,7 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: st.w $a3, $fp, -24 ; CHECK-NEXT: ori $a4, $zero, 4 ; CHECK-NEXT: st.w $a4, $fp, -28 -; CHECK-NEXT: st.w $a1, $a0, 0 +; CHECK-NEXT: st.w $a1, $a0, %pc_lo12(x) ; CHECK-NEXT: #APP ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: st.w $a1, $fp, -16 diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn index 248e47a9584cdf2..09a5311c122fc54 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/LoongArch/BUILD.gn @@ -42,6 +42,7 @@ static_library("LLVMLoongArchCodeGen") { "LoongArchISelLowering.cpp", "LoongArchInstrInfo.cpp", "LoongArchMCInstLower.cpp", + "LoongArchMergeBaseOffset.cpp", "LoongArchOptWInstrs.cpp", "LoongArchRegisterInfo.cpp", "LoongArchSubtarget.cpp", From a760df316510fc56576d0934f33577a4ce64b146 Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Thu, 8 Aug 2024 16:11:21 +0100 Subject: [PATCH 022/112] [Clang] Implement CWG2137 (list-initialization from objects of the same type) (#94355) [CWG2137](https://cplusplus.github.io/CWG/issues/2137.html) This was previously implemented and then reverted in Clang 18 as #77768 This also implements a workaround for [CWG2311](https://cplusplus.github.io/CWG/issues/2311.html), similarly to the 2024-03-01 comment for 
[CWG2742](https://cplusplus.github.io/CWG/issues/2742.html). The exact wording this tries to implement, relative to the C++26 draft: [over.match.list]p(1.2) > Otherwise, or if no viable initializer-list constructor is found and the initializer list does not consist of exactly a single element with the same cv-unqualified class type as `T`, overload resolution is performed again, [...] [dcl.init.list]p(3.7) > Otherwise, if `T` is a class type, constructors are considered. The applicable constructors are enumerated and the best one is chosen through overload resolution. If no constructor is found and the initializer list consists of exactly a single element with the same cv-unqualified class type as `T`, the object is initialized from that element (by copy-initialization for copy-list-initialization, or by direct-initialization for direct-list-initialization). Otherwise, if a narrowing conversion (see below) is required [...] --- clang/docs/ReleaseNotes.rst | 10 ++ clang/lib/Sema/SemaInit.cpp | 56 +++++++++-- clang/lib/Sema/SemaOverload.cpp | 41 +++++--- clang/test/CXX/drs/cwg14xx.cpp | 10 -- clang/test/CXX/drs/cwg21xx.cpp | 45 ++++++++- clang/test/CXX/drs/cwg23xx.cpp | 99 +++++++++++++++++++ ...xx1z-class-template-argument-deduction.cpp | 5 +- .../test/SemaCXX/single-element-init-list.cpp | 82 +++++++++++++++ clang/www/cxx_dr_status.html | 2 +- 9 files changed, 315 insertions(+), 35 deletions(-) create mode 100644 clang/test/SemaCXX/single-element-init-list.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a355b8db11faf0a..cb0a1b25e51c2a5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -101,6 +101,16 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Allow calling initializer list constructors from initializer lists with + a single element of the same type instead of always copying. 
+ (`CWG2137: List-initialization from object of same type `) + +- Speculative resolution for CWG2311 implemented so that the implementation of CWG2137 doesn't remove + previous cases where guaranteed copy elision was done. Given a prvalue ``e`` of class type + ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful. + Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. + (`CWG2311: Missed case for guaranteed copy elision `) + C Language Changes ------------------ diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index abd4401e029817d..2666e60c0dd67c4 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4340,7 +4340,7 @@ static OverloadingResult ResolveConstructorOverload( /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// type as the entity? +/// aggregate type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4370,6 +4370,14 @@ static void TryConstructorInitialization(Sema &S, Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); + bool CopyElisionPossible = false; + auto ElideConstructor = [&] { + // Convert qualifications if necessary. 
+ Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + }; + // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4382,11 +4390,33 @@ static void TryConstructorInitialization(Sema &S, if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - // Convert qualifications if necessary. - Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - return; + if (ILE && !DestType->isAggregateType()) { + // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision + // Make this an elision if this won't call an initializer-list + // constructor. (Always on an aggregate type or check constructors first.) + + // This effectively makes our resolution as follows. The parts in angle + // brackets are additions. + // C++17 [over.match.list]p(1.2): + // - If no viable initializer-list constructor is found , [...] + // C++17 [dcl.init.list]p(3.6): + // - Otherwise, if T is a class type, constructors are considered. The + // applicable constructors are enumerated and the best one is chosen + // through overload resolution. + // if a narrowing conversion [...] 
+ assert(!IsInitListCopy && + "IsInitListCopy only possible with aggregate types"); + CopyElisionPossible = true; + } else { + ElideConstructor(); + return; + } } const RecordType *DestRecordType = DestType->getAs(); @@ -4431,6 +4461,12 @@ static void TryConstructorInitialization(Sema &S, S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); + + if (CopyElisionPossible && Result == OR_No_Viable_Function) { + // No initializer list candidate + ElideConstructor(); + return; + } } // C++11 [over.match.list]p1: @@ -4712,9 +4748,9 @@ static void TryListInitialization(Sema &S, return; } - // C++11 [dcl.init.list]p3, per DR1467: - // - If T is a class type and the initializer list has a single element of - // type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467 and DR2137: + // - If T is an aggregate class and the initializer list has a single element + // of type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4725,7 +4761,7 @@ static void TryListInitialization(Sema &S, // - Otherwise, if T is an aggregate, [...] (continue below). 
if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType()) { + if (DestType->isRecordType() && DestType->isAggregateType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index fd88b6a74297dc7..0f196ddf812fdfc 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1619,19 +1619,36 @@ TryUserDefinedConversion(Sema &S, Expr *From, QualType ToType, // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromCanon - = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); + QualType FromType; + SourceLocation FromLoc; + // C++11 [over.ics.list]p6, per DR2137: + // C++17 [over.ics.list]p6: + // If C is not an initializer-list constructor and the initializer list + // has a single element of type cv U, where U is X or a class derived + // from X, the implicit conversion sequence has Exact Match rank if U is + // X, or Conversion rank if U is derived from X. 
+ if (const auto *InitList = dyn_cast(From); + InitList && InitList->getNumInits() == 1 && + !S.isInitListConstructor(Constructor)) { + const Expr *SingleInit = InitList->getInit(0); + FromType = SingleInit->getType(); + FromLoc = SingleInit->getBeginLoc(); + } else { + FromType = From->getType(); + FromLoc = From->getBeginLoc(); + } + QualType FromCanon = + S.Context.getCanonicalType(FromType.getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); - if (Constructor->isCopyConstructor() && - (FromCanon == ToCanon || - S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { + if ((FromCanon == ToCanon || + S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(From->getType()); + ICS.Standard.setFromType(FromType); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5335,18 +5352,18 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, IsDesignatedInit) return Result; - // Per DR1467: - // If the parameter type is a class X and the initializer list has a single - // element of type cv U, where U is X or a class derived from X, the - // implicit conversion sequence is the one required to convert the element - // to the parameter type. + // Per DR1467 and DR2137: + // If the parameter type is an aggregate class X and the initializer list + // has a single element of type cv U, where U is X or a class derived from + // X, the implicit conversion sequence is the one required to convert the + // element to the parameter type. // // Otherwise, if the parameter type is a character array [... 
] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType()) { + if (ToType->isRecordType() && ToType->isAggregateType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index f01d96ad47f3e3a..a23ac7444363315 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -505,16 +505,6 @@ namespace cwg1467 { // cwg1467: 3.7 c++11 } } // nonaggregate - namespace SelfInitIsNotListInit { - struct S { - S(); - explicit S(S &); - S(const S &); - }; - S s1; - S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor - } - struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff --git a/clang/test/CXX/drs/cwg21xx.cpp b/clang/test/CXX/drs/cwg21xx.cpp index d7bc52dd9d44646..2800228748e609b 100644 --- a/clang/test/CXX/drs/cwg21xx.cpp +++ b/clang/test/CXX/drs/cwg21xx.cpp @@ -12,7 +12,15 @@ #endif namespace std { -struct type_info; + typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; + + struct type_info; } namespace cwg2100 { // cwg2100: 12 @@ -136,6 +144,41 @@ namespace cwg2126 { // cwg2126: 12 #endif } +namespace cwg2137 { // cwg2137: 20 +#if __cplusplus >= 201103L + struct Q { + Q(); + Q(Q&&); + Q(std::initializer_list) = delete; // #cwg2137-Qcons + }; + + Q x = Q { Q() }; + // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#cwg2137-Qcons {{'Q' has been explicitly marked deleted here}} + + int f(Q); // #cwg2137-f + int y = f({ Q() }); + // 
since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#cwg2137-Qcons {{'Q' has been explicitly marked deleted here}} + // since-cxx11-note@#cwg2137-f {{passing argument to parameter here}} + + struct U { + U(); + U(const U&); + }; + + struct Derived : U { + Derived(); + Derived(const Derived&); + } d; + + int g(Derived); + int g(U(&&)[1]) = delete; + + int z = g({ d }); +#endif +} + namespace cwg2140 { // cwg2140: 9 #if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff --git a/clang/test/CXX/drs/cwg23xx.cpp b/clang/test/CXX/drs/cwg23xx.cpp index e4a1e90941dbf00..77fd6a337436e3a 100644 --- a/clang/test/CXX/drs/cwg23xx.cpp +++ b/clang/test/CXX/drs/cwg23xx.cpp @@ -6,6 +6,16 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +namespace std { + __extension__ typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; +} + #if __cplusplus >= 201103L namespace cwg2303 { // cwg2303: 12 template @@ -94,6 +104,95 @@ struct Z : W, // cwg2331: na // cwg2335 is in cwg2335.cxx +namespace cwg2311 { // cwg2311 is open with no proposed resolution +#if __cplusplus >= 201707L +template +void test() { + // Ensure none of these try to call a move constructor. 
+ T a = T{T(0)}; + T b{T(0)}; + auto c{T(0)}; + T d = {T(0)}; + auto e = {T(0)}; +#if __cplusplus >= 202302L + auto f = auto{T(0)}; +#endif + void(*fn)(T); + fn({T(0)}); +} + +struct NonMovable { + NonMovable(int); + NonMovable(NonMovable&&) = delete; +}; +struct NonMovableNonApplicableIList { + NonMovableNonApplicableIList(int); + NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; + NonMovableNonApplicableIList(std::initializer_list); +}; +struct ExplicitMovable { + ExplicitMovable(int); + explicit ExplicitMovable(ExplicitMovable&&); +}; +struct ExplicitNonMovable { + ExplicitNonMovable(int); + explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; +}; +struct ExplicitNonMovableNonApplicableIList { + ExplicitNonMovableNonApplicableIList(int); + explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; + ExplicitNonMovableNonApplicableIList(std::initializer_list); +}; +struct CopyOnly { + CopyOnly(int); + CopyOnly(const CopyOnly&); + CopyOnly(CopyOnly&&) = delete; +}; +struct ExplicitCopyOnly { + ExplicitCopyOnly(int); + explicit ExplicitCopyOnly(const ExplicitCopyOnly&); + explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; +}; + +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); + +struct any { + template + any(T&&); +}; + +template +struct X { + X(); + X(T) = delete; // #cwg2311-X +}; + +X> x{ X>() }; +// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} +// since-cxx17-note@#cwg2311-X {{'X' has been explicitly marked deleted here}} + +// Per the currently implemented resolution, this does not apply to std::initializer_list. 
+// An initializer list initialized from `{ e }` always has exactly one element constructed +// from `e`, where previously that could have been a copy of an init list or `e.operator std::initializer_list()` +struct InitListCtor { + InitListCtor(int); + InitListCtor(InitListCtor&&) = delete; + InitListCtor(std::initializer_list) = delete; // #cwg2311-InitListCtor +}; + +std::initializer_list i; +auto j = std::initializer_list{ i }; +// since-cxx17-error@-1 {{conversion function from 'std::initializer_list' to 'const cwg2311::InitListCtor' invokes a deleted function}} +// since-cxx17-note@#cwg2311-InitListCtor {{'InitListCtor' has been explicitly marked deleted here}} +#endif +} + #if __cplusplus >= 201103L namespace cwg2338 { // cwg2338: 12 namespace B { diff --git a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp index 9ef5303a9c4df55..9aaa13d7ac41ad7 100644 --- a/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp +++ b/clang/test/SemaCXX/cxx1z-class-template-argument-deduction.cpp @@ -19,13 +19,16 @@ template constexpr bool has_type(T&) { return true; } std::initializer_list il1 = {1, 2, 3, 4, 5}; auto il2 = std::initializer_list{1, 2, 3, 4}; -auto il3 = std::initializer_list{il1}; +auto il3 = std::initializer_list(il1); auto il4 = std::initializer_list{il1, il1, il1}; static_assert(has_type>(il1)); static_assert(has_type>(il2)); static_assert(has_type>(il3)); static_assert(has_type>>(il4)); +auto il5 = std::initializer_list{il1}; +// expected-error@-1 {{no viable conversion from 'std::initializer_list' to 'const int'}} + template struct vector { template vector(Iter, Iter); vector(std::initializer_list); diff --git a/clang/test/SemaCXX/single-element-init-list.cpp b/clang/test/SemaCXX/single-element-init-list.cpp new file mode 100644 index 000000000000000..33d986e08401383 --- /dev/null +++ b/clang/test/SemaCXX/single-element-init-list.cpp @@ -0,0 +1,82 @@ +// RUN: 
%clang_cc1 -std=c++17 -fsyntax-only -verify %s + +// This is heavily affected by the speculative resolution applied to CWG2311 +// So behaviour shown here is subject to change. + +// expected-no-diagnostics + +namespace std { + typedef decltype(sizeof(int)) size_t; + + // libc++'s implementation + template + class initializer_list + { + const _E* __begin_; + size_t __size_; + + initializer_list(const _E* __b, size_t __s) + : __begin_(__b), + __size_(__s) + {} + + public: + typedef _E value_type; + typedef const _E& reference; + typedef const _E& const_reference; + typedef size_t size_type; + + typedef const _E* iterator; + typedef const _E* const_iterator; + + constexpr initializer_list() : __begin_(nullptr), __size_(0) {} + + constexpr size_t size() const {return __size_;} + const _E* begin() const {return __begin_;} + const _E* end() const {return __begin_ + __size_;} + }; + + template + struct vector { + size_t sz; + constexpr vector() : sz(0) {} + constexpr vector(initializer_list ilist) : sz(ilist.size()) {} + constexpr vector(const vector& other) : sz(other.sz) {} + constexpr std::size_t size() const { return sz; } + }; +} + +// https://github.com/llvm/llvm-project/pull/77768#issuecomment-1908062472 +namespace Issue1 { + struct A { + constexpr A() {} + }; + + struct B { + int called_ctor; + constexpr explicit B(A) : called_ctor(0) {} + constexpr explicit B(std::vector) : called_ctor(1) {} + }; + + struct C { + B b; + constexpr C() : b({A()}) {} + }; + + static_assert(C().b.called_ctor == 0); +} + +// https://github.com/llvm/llvm-project/pull/77768#issuecomment-1957171805 +namespace Issue2 { + struct A { + constexpr A(int x_) {} + constexpr A(const std::vector& a) {} + }; + + void f() { + constexpr std::vector a{1,2}; + constexpr std::vector b{a}; + // -> constexpr std::vector b(std::initializer_list{ A(a) }); + static_assert(b.size() == 1); + } +} diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b8b733dbfa7163b..5ad60002733a04d 
100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12637,7 +12637,7 @@

C++ defect report implementation status

2137 CD4 List-initialization from object of same type - Unknown + Clang 20 2138 From 899f648866affd011baae627752ba15baabc2ef9 Mon Sep 17 00:00:00 2001 From: David Tenty Date: Thu, 8 Aug 2024 11:16:18 -0400 Subject: [PATCH 023/112] [NFC][llvm][support] rename INFINITY in regcomp (#101758) since C23 this macro is defined by float.h, which clang implements in it's float.h since #96659 landed. However, regcomp.c in LLVMSupport happened to define it's own macro with that name, leading to problems when bootstrapping. This change renames the offending macro. --- llvm/lib/Support/regcomp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index 990aef32a396faf..daa41eb4912ef47 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -278,7 +278,7 @@ static char nuls[10]; /* place to point scanner in event of error */ #else #define DUPMAX 255 #endif -#define INFINITY (DUPMAX + 1) +#define REGINFINITY (DUPMAX + 1) #ifndef NDEBUG static int never = 0; /* for use in asserts; shuts lint up */ @@ -582,7 +582,7 @@ p_ere_exp(struct parse *p) count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ - count2 = INFINITY; + count2 = REGINFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); @@ -753,7 +753,7 @@ p_simp_re(struct parse *p, count2 = p_count(p); REQUIRE(count <= count2, REG_BADBR); } else /* single number with comma */ - count2 = INFINITY; + count2 = REGINFINITY; } else /* just a single number */ count2 = count; repeat(p, pos, count, count2); @@ -1115,7 +1115,7 @@ repeat(struct parse *p, # define N 2 # define INF 3 # define REP(f, t) ((f)*8 + (t)) -# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) +# define MAP(n) (((n) <= 1) ? (n) : ((n) == REGINFINITY) ? 
INF : N) sopno copy; if (p->error != 0) /* head off possible runaway recursion */ From 7a4fc7491ce8083d81f1bffa7fa9546cb53aa5e0 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 8 Aug 2024 23:23:10 +0800 Subject: [PATCH 024/112] [SLP][REVEC] Fix insertelement has multiple uses. (#102329) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 10 ++++++++ llvm/test/Transforms/SLPVectorizer/revec.ll | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index d14dd1dabd5fcd2..575d178374844b7 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6419,6 +6419,16 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( return TreeEntry::NeedToGather; } + if (any_of(VL, [&SourceVectors](Value *V) { + // The last InsertElement can have multiple uses. + return SourceVectors.contains(V) && !V->hasOneUse(); + })) { + assert(SLPReVec && "Only supported by REVEC."); + LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " + "multiple uses.\n"); + return TreeEntry::NeedToGather; + } + return TreeEntry::Vectorize; } case Instruction::Load: { diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index d6dd4128de9c7e5..d7c3ccd8c9ce8a4 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -124,3 +124,26 @@ entry: store <8 x i1> %6, ptr %7, align 1 ret void } + +define void @test5(ptr %ptr0, ptr %ptr1) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr i8, ptr null, i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> , ptr [[GETELEMENTPTR0]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> [[TMP0]], ptr null, i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x ptr> zeroinitializer, [[TMP1]] +; CHECK-NEXT: 
[[TMP3:%.*]] = insertelement <4 x ptr> , ptr [[PTR0:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[PTR1:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <4 x ptr> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret void +; +entry: + %getelementptr0 = getelementptr i8, ptr null, i64 0 + %0 = insertelement <4 x ptr> , ptr %getelementptr0, i32 2 + %1 = insertelement <4 x ptr> %0, ptr null, i32 3 + %2 = icmp ult <4 x ptr> zeroinitializer, %1 + %3 = insertelement <4 x ptr> , ptr %ptr0, i32 0 + %4 = insertelement <4 x ptr> %1, ptr %ptr1, i32 3 + %5 = icmp ult <4 x ptr> %3, %4 + ret void +} From 7e5fe697bf408172250b077e151ace3a834da2cc Mon Sep 17 00:00:00 2001 From: weiguozhi <57237827+weiguozhi@users.noreply.github.com> Date: Thu, 8 Aug 2024 08:26:23 -0700 Subject: [PATCH 025/112] [X86] Speed up checking clobbered FP/BP (#102365) Most functions don't clobber frame register and base pointer. They are usually caused by inline asm and function call. So we can record if a function call clobber FP/BP at lowering phase, and later we can check the recorded information and return early. --- llvm/lib/Target/X86/X86FrameLowering.cpp | 10 ++++++++++ llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 5 +++++ llvm/lib/Target/X86/X86MachineFunctionInfo.h | 10 ++++++++++ 3 files changed, 25 insertions(+) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 77dac1197f85e91..8404f2231680d61 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -4458,6 +4458,16 @@ void X86FrameLowering::spillFPBP(MachineFunction &MF) const { FP = TRI->getFrameRegister(MF); if (TRI->hasBasePointer(MF)) BP = TRI->getBaseRegister(); + + // Currently only inline asm and function call can clobbers fp/bp. So we can + // do some quick test and return early. 
+ if (!MF.hasInlineAsm()) { + X86MachineFunctionInfo *X86FI = MF.getInfo(); + if (!X86FI->getFPClobberedByCall()) + FP = 0; + if (!X86FI->getBPClobberedByCall()) + BP = 0; + } if (!FP && !BP) return; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index f659c168b86e0ed..1e609a84673a3cc 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -2450,6 +2450,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, }(); assert(Mask && "Missing call preserved mask for calling convention"); + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getFrameRegister(MF))) + X86Info->setFPClobberedByCall(true); + if (MachineOperand::clobbersPhysReg(Mask, RegInfo->getBaseRegister())) + X86Info->setBPClobberedByCall(true); + // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index 315aeef65d28c8b..13d57c2fa9dfbc2 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -170,6 +170,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { SmallVector PreallocatedStackSizes; SmallVector, 0> PreallocatedArgOffsets; + // True if a function clobbers FP/BP according to its calling convention. + bool FPClobberedByCall = false; + bool BPClobberedByCall = false; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. 
@@ -328,6 +332,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set"); return PreallocatedArgOffsets[Id]; } + + bool getFPClobberedByCall() const { return FPClobberedByCall; } + void setFPClobberedByCall(bool C) { FPClobberedByCall = C; } + + bool getBPClobberedByCall() const { return BPClobberedByCall; } + void setBPClobberedByCall(bool C) { BPClobberedByCall = C; } }; } // End llvm namespace From da492d438790cc7c51f962071a887af7d20be30d Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Thu, 8 Aug 2024 16:31:11 +0100 Subject: [PATCH 026/112] [mlir][vector] Fix return of `DropUnitDimsFromTransposeOp` pattern (#102478) This accidentally returned `failure()` (rather than `success()`) when it applied. --- mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 55c1c6bad9f2a40..bc0c96b32a80f5b 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1782,7 +1782,7 @@ struct DropUnitDimsFromTransposeOp final rewriter.replaceOpWithNewOp( op, op.getResultVectorType(), tranposeWithoutUnitDims); - return failure(); + return success(); } }; From 722c066c59b25bc73f41dbe92193319aa5b8783a Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 8 Aug 2024 16:36:05 +0100 Subject: [PATCH 027/112] [libc] Make use of 32-bit time_t a config option (#102012) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 32-bit Arm builds of libc define time_t to be `__INTPTR_TYPE__`, i.e. a 32-bit integer. This is commented in the commit introducing it (75398f28ebdb600) as being for compatibility with glibc. 
But in the near future not even every AArch32 build of glibc will have a 32-bit time_t: Debian is planning that their next release (trixie) will have switched to 64-bit. And non-Linux builds of this libc (e.g. baremetal) have no reason to need glibc compatibility in the first place – and every reason _not_ to want to start using a 32-bit time_t in 2024 or later. So I've replaced the `#ifdef` in `llvm-libc-types/time_t.h` with two versions of the header file, chosen in `CMakeLists.txt` via a new configuration option. This involved adding an extra parameter to the cmake `add_header` function to specify different names for the header file in the source and destination directories. --- .../cmake/modules/LLVMLibCCompileOptionRules.cmake | 4 ++++ libc/cmake/modules/LLVMLibCFlagRules.cmake | 13 +++++++++++++ libc/cmake/modules/LLVMLibCHeaderRules.cmake | 11 ++++++++--- libc/config/config.json | 6 ++++++ libc/docs/configure.rst | 2 ++ libc/include/llvm-libc-types/CMakeLists.txt | 6 +++++- libc/include/llvm-libc-types/time_t.h | 8 ++++---- libc/include/llvm-libc-types/time_t_32.h | 14 ++++++++++++++ libc/include/llvm-libc-types/time_t_64.h | 14 ++++++++++++++ 9 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 libc/include/llvm-libc-types/time_t_32.h create mode 100644 libc/include/llvm-libc-types/time_t_64.h diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 9fc10375a1d379f..e3dfe1a15296919 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -71,6 +71,10 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}") endif() + if(LIBC_TYPES_TIME_T_IS_32_BIT AND LLVM_LIBC_FULL_BUILD) + list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) 
endfunction(_get_compile_options_from_config) diff --git a/libc/cmake/modules/LLVMLibCFlagRules.cmake b/libc/cmake/modules/LLVMLibCFlagRules.cmake index 3629a7f111a7c52..69f31ace80dd3dd 100644 --- a/libc/cmake/modules/LLVMLibCFlagRules.cmake +++ b/libc/cmake/modules/LLVMLibCFlagRules.cmake @@ -286,3 +286,16 @@ if(NOT((LIBC_TARGET_ARCHITECTURE_IS_X86 AND (LIBC_CPU_FEATURES MATCHES "SSE4_2") LIBC_TARGET_ARCHITECTURE_IS_AARCH64 OR LIBC_TARGET_OS_IS_GPU)) set(SKIP_FLAG_EXPANSION_ROUND_OPT TRUE) endif() + +# Choose whether time_t is 32- or 64-bit, based on target architecture +# and config options. This will be used to set a #define during the +# library build, and also to select the right version of time_t.h for +# the output headers. +if(LIBC_TARGET_ARCHITECTURE_IS_ARM AND NOT (LIBC_CONF_TIME_64BIT)) + # Set time_t to 32 bit for compatibility with glibc, unless + # configuration says otherwise + set(LIBC_TYPES_TIME_T_IS_32_BIT TRUE) +else() + # Other platforms default to 64-bit time_t + set(LIBC_TYPES_TIME_T_IS_32_BIT FALSE) +endif() diff --git a/libc/cmake/modules/LLVMLibCHeaderRules.cmake b/libc/cmake/modules/LLVMLibCHeaderRules.cmake index 3049f4db7301f66..c2c675bda26d31e 100644 --- a/libc/cmake/modules/LLVMLibCHeaderRules.cmake +++ b/libc/cmake/modules/LLVMLibCHeaderRules.cmake @@ -9,8 +9,8 @@ function(add_header target_name) cmake_parse_arguments( "ADD_HEADER" - "" # No optional arguments - "HDR" # Single value arguments + "" # No optional arguments + "HDR;DEST_HDR" # Single value arguments "DEPENDS" ${ARGN} ) @@ -18,7 +18,12 @@ function(add_header target_name) message(FATAL_ERROR "'add_header' rules requires the HDR argument specifying a headef file.") endif() - set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_HEADER_HDR}) + if(ADD_HEADER_DEST_HDR) + set(dest_leaf_filename ${ADD_HEADER_DEST_HDR}) + else() + set(dest_leaf_filename ${ADD_HEADER_HDR}) + endif() + set(absolute_path ${CMAKE_CURRENT_SOURCE_DIR}/${dest_leaf_filename}) file(RELATIVE_PATH 
relative_path ${LIBC_INCLUDE_SOURCE_DIR} ${absolute_path}) set(dest_file ${LIBC_INCLUDE_DIR}/${relative_path}) set(src_file ${CMAKE_CURRENT_SOURCE_DIR}/${ADD_HEADER_HDR}) diff --git a/libc/config/config.json b/libc/config/config.json index 538fea53cc704ab..2e72c0a3fd1d690 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -88,5 +88,11 @@ "value": true, "doc": "Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved." } + }, + "time": { + "LIBC_CONF_TIME_64BIT": { + "value": false, + "doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit." + } } } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 950de0eee4c05df..54ca5d55d7b2435 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -52,3 +52,5 @@ to learn about the defaults for your platform and target. * **"string" options** - ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled. - ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen. +* **"time" options** + - ``LIBC_CONF_TIME_64BIT``: Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit. 
diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 9e77ab226ce6c25..0fa86e0152f9ba9 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -59,7 +59,11 @@ add_header(pthread_rwlockattr_t HDR pthread_rwlockattr_t.h) add_header(pthread_spinlock_t HDR pthread_spinlock_t.h DEPENDS .pid_t) add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type) add_header(rlim_t HDR rlim_t.h) -add_header(time_t HDR time_t.h) +if(LIBC_TYPES_TIME_T_IS_32_BIT) + add_header(time_t HDR time_t_32.h DEST_HDR time_t.h) +else() + add_header(time_t HDR time_t_64.h DEST_HDR time_t.h) +endif() add_header(stack_t HDR stack_t.h DEPENDS .size_t) add_header(suseconds_t HDR suseconds_t.h) add_header(struct_flock HDR struct_flock.h DEPENDS .off_t .pid_t) diff --git a/libc/include/llvm-libc-types/time_t.h b/libc/include/llvm-libc-types/time_t.h index 59953b343ba9634..76920dc07ec69c6 100644 --- a/libc/include/llvm-libc-types/time_t.h +++ b/libc/include/llvm-libc-types/time_t.h @@ -1,4 +1,4 @@ -//===-- Definition of the type time_t -------------------------------------===// +//===-- Definition of the type time_t, for use during the libc build ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -9,10 +9,10 @@ #ifndef LLVM_LIBC_TYPES_TIME_T_H #define LLVM_LIBC_TYPES_TIME_T_H -#if (defined(__arm__) || defined(_M_ARM)) -typedef __INTPTR_TYPE__ time_t; +#ifdef LIBC_TYPES_TIME_T_IS_32_BIT +#include "time_t_32.h" #else -typedef __INT64_TYPE__ time_t; +#include "time_t_64.h" #endif #endif // LLVM_LIBC_TYPES_TIME_T_H diff --git a/libc/include/llvm-libc-types/time_t_32.h b/libc/include/llvm-libc-types/time_t_32.h new file mode 100644 index 000000000000000..2c415f6fa9dcabb --- /dev/null +++ b/libc/include/llvm-libc-types/time_t_32.h @@ -0,0 +1,14 @@ +//===-- Definition of the type time_t -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_TIME_T_32_H +#define LLVM_LIBC_TYPES_TIME_T_32_H + +typedef __INT32_TYPE__ time_t; + +#endif // LLVM_LIBC_TYPES_TIME_T_32_H diff --git a/libc/include/llvm-libc-types/time_t_64.h b/libc/include/llvm-libc-types/time_t_64.h new file mode 100644 index 000000000000000..8f7fd3233646e62 --- /dev/null +++ b/libc/include/llvm-libc-types/time_t_64.h @@ -0,0 +1,14 @@ +//===-- Definition of the type time_t -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_TYPES_TIME_T_64_H +#define LLVM_LIBC_TYPES_TIME_T_64_H + +typedef __INT64_TYPE__ time_t; + +#endif // LLVM_LIBC_TYPES_TIME_T_64_H From eddfd504f8bd54b8723ffaa6667e61ecf393652b Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:41:17 -0400 Subject: [PATCH 028/112] [libc][math][c23] Add ddivl C23 math function. (#102468) --- libc/config/darwin/arm/entrypoints.txt | 1 + libc/config/darwin/x86_64/entrypoints.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/config/windows/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/newhdrgen/yaml/math.yaml | 7 +++++++ libc/spec/stdc.td | 2 +- libc/src/__support/FPUtil/generic/div.h | 2 +- libc/src/math/CMakeLists.txt | 1 + libc/src/math/ddivl.h | 20 ++++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 12 ++++++++++++ libc/src/math/generic/ddivl.cpp | 20 ++++++++++++++++++++ libc/test/src/math/CMakeLists.txt | 14 +++++++++++++- libc/test/src/math/DivTest.h | 4 ++-- libc/test/src/math/ddivl_test.cpp | 13 +++++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 18 +++++++++++++++--- libc/test/src/math/smoke/ddivl_test.cpp | 13 +++++++++++++ 19 files changed, 125 insertions(+), 9 deletions(-) create mode 100644 libc/src/math/ddivl.h create mode 100644 libc/src/math/generic/ddivl.cpp create mode 100644 libc/test/src/math/ddivl_test.cpp create mode 100644 libc/test/src/math/smoke/ddivl_test.cpp diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 27822f5013f2fcc..78e46b18993a096 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -149,6 +149,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.dfmal 
libc.src.math.dsqrtl libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dsubl libc.src.math.erff libc.src.math.exp diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt index aa13fb276750fc6..49c19571ac41927 100644 --- a/libc/config/darwin/x86_64/entrypoints.txt +++ b/libc/config/darwin/x86_64/entrypoints.txt @@ -120,6 +120,7 @@ set(TARGET_LIBM_ENTRYPOINTS #libc.src.math.coshf #libc.src.math.cosf #libc.src.math.daddl + #libc.src.math.ddivl #libc.src.math.dfmal #libc.src.math.dsqrtl #libc.src.math.dsubl diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index f2ab6c0ba73d741..034db23932a3987 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -389,6 +389,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.cospif libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal libc.src.math.dmull libc.src.math.dsqrtl diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index a7deccb9ded988c..e2d8e9e32496778 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -388,6 +388,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.coshf libc.src.math.cospif libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal libc.src.math.dmull libc.src.math.dsqrtl diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 6b16843e5027d77..817f4190399aa19 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -391,6 +391,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.dmull libc.src.math.dsqrtl libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dsubl libc.src.math.erff libc.src.math.exp diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 87531028959ef11..1f36bdf9b21ee8d 100644 --- 
a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -137,6 +137,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.cosf libc.src.math.coshf libc.src.math.daddl + libc.src.math.ddivl libc.src.math.dfmal libc.src.math.dsubl libc.src.math.erff diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index c0cecdfa1b25ac3..a3174c72e86ce12 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -116,7 +116,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dadd | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.1 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| ddiv | N/A | N/A | | N/A | |check|\* | 7.12.14.4 | F.10.11 | +| ddiv | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.4 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | dfma | N/A | N/A | |check| | N/A | |check|\* | 7.12.14.5 | F.10.11 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index c0485428fc559e5..1b474b3401ecb5a 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2352,3 +2352,10 @@ functions: arguments: - type: long double - type: int * + - name: ddivl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long double diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 0c58af084698940..082b1e499e0d02b 100644 --- 
a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -398,7 +398,7 @@ def StdC : StandardSpec<"stdc"> { GuardedFunctionSpec<"ceilf128", RetValSpec, [ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"daddl", RetValSpec, [ArgSpec, ArgSpec]>, - + FunctionSpec<"ddivl", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"dfmal", RetValSpec, [ArgSpec, ArgSpec, ArgSpec]>, FunctionSpec<"dsubl", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/__support/FPUtil/generic/div.h b/libc/src/__support/FPUtil/generic/div.h index dad1772fce75007..f0e405772e9fa3c 100644 --- a/libc/src/__support/FPUtil/generic/div.h +++ b/libc/src/__support/FPUtil/generic/div.h @@ -35,7 +35,7 @@ div(InType x, InType y) { using InFPBits = FPBits; using InStorageType = typename InFPBits::StorageType; using DyadicFloat = - DyadicFloat(InFPBits::FRACTION_LEN))>; + DyadicFloat(InFPBits::SIG_LEN + 1))>; InFPBits x_bits(x); InFPBits y_bits(y); diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 56e9bb60c1e4362..a07065c93003f26 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -88,6 +88,7 @@ add_math_entrypoint_object(cospif) add_math_entrypoint_object(daddl) add_math_entrypoint_object(daddf128) +add_math_entrypoint_object(ddivl) add_math_entrypoint_object(ddivf128) add_math_entrypoint_object(dmull) add_math_entrypoint_object(dmulf128) diff --git a/libc/src/math/ddivl.h b/libc/src/math/ddivl.h new file mode 100644 index 000000000000000..bf0da2887e330f2 --- /dev/null +++ b/libc/src/math/ddivl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for ddivl -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_DDIVL_H +#define LLVM_LIBC_SRC_MATH_DDIVL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double ddivl(long double x, long double y); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_DDIVL_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index e5f40673dd5f021..a224bed8c3a14c4 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -154,6 +154,18 @@ add_entrypoint_object( libc.src.__support.FPUtil.generic.add_sub ) +add_entrypoint_object( + ddivl + SRCS + ddivl.cpp + HDRS + ../ddivl.h + COMPILE_OPTIONS + -O3 + DEPENDS + libc.src.__support.FPUtil.generic.div +) + add_entrypoint_object( ddivf128 SRCS diff --git a/libc/src/math/generic/ddivl.cpp b/libc/src/math/generic/ddivl.cpp new file mode 100644 index 000000000000000..18fc44d6f1648e9 --- /dev/null +++ b/libc/src/math/generic/ddivl.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of ddivl function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/ddivl.h" +#include "src/__support/FPUtil/generic/div.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, ddivl, (long double x, long double y)) { + return fputil::generic::div(x, y); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f3703eb59999b1c..d0106972809cc02 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2464,6 +2464,19 @@ add_fp_unittest( libc.src.stdlib.srand ) +add_fp_unittest( + ddivl_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + ddivl_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.ddivl +) + add_fp_unittest( dfmal_test NEED_MPFR @@ -2518,7 +2531,6 @@ add_fp_unittest( libc.src.math.fdivl ) - add_fp_unittest( ffma_test NEED_MPFR diff --git a/libc/test/src/math/DivTest.h b/libc/test/src/math/DivTest.h index 1cdc1398a1a1c1c..c14d16fb5773bc9 100644 --- a/libc/test/src/math/DivTest.h +++ b/libc/test/src/math/DivTest.h @@ -47,7 +47,7 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { InType x = InFPBits(v).get_val(); InType y = InFPBits(w).get_val(); mpfr::BinaryInput input{x, y}; - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), 0.5); } } @@ -60,7 +60,7 @@ class DivTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { InType x = InFPBits(v).get_val(); InType y = InFPBits(w).get_val(); mpfr::BinaryInput input{x, y}; - EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), + ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Div, input, func(x, y), 0.5); } } diff --git a/libc/test/src/math/ddivl_test.cpp b/libc/test/src/math/ddivl_test.cpp 
new file mode 100644 index 000000000000000..7768766e30c12ab --- /dev/null +++ b/libc/test/src/math/ddivl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for ddivl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/ddivl.h" + +LIST_DIV_TESTS(double, long double, LIBC_NAMESPACE::ddivl) diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 45e1c4c26cc147b..ef303228a9fe007 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -4615,7 +4615,7 @@ add_fp_unittest( ) add_fp_unittest( - daddl + daddl_test SUITE libc-math-smoke-tests SRCS @@ -4627,7 +4627,7 @@ add_fp_unittest( ) add_fp_unittest( - daddf128 + daddf128_test SUITE libc-math-smoke-tests SRCS @@ -4639,7 +4639,19 @@ add_fp_unittest( ) add_fp_unittest( - ddivf128 + ddivl_test + SUITE + libc-math-smoke-tests + SRCS + ddivl_test.cpp + HDRS + DivTest.h + DEPENDS + libc.src.math.ddivl +) + +add_fp_unittest( + ddivf128_test SUITE libc-math-smoke-tests SRCS diff --git a/libc/test/src/math/smoke/ddivl_test.cpp b/libc/test/src/math/smoke/ddivl_test.cpp new file mode 100644 index 000000000000000..7768766e30c12ab --- /dev/null +++ b/libc/test/src/math/smoke/ddivl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for ddivl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DivTest.h" + +#include "src/math/ddivl.h" + +LIST_DIV_TESTS(double, long double, LIBC_NAMESPACE::ddivl) From 4189add58eb82d010ea0b8e29d94b27231619974 Mon Sep 17 00:00:00 2001 From: Andrey Timonin <112198242+EtoAndruwa@users.noreply.github.com> Date: Thu, 8 Aug 2024 18:47:21 +0300 Subject: [PATCH 029/112] [NFC][mlir][scf] Fix SCF dialect's operations' descriptions (#101653) - Added the dialect's prefix to operations' descriptions to follow the same style inside the TableGen file. - Minor changes in some operations' descriptions. --- mlir/include/mlir/Dialect/SCF/IR/SCFOps.td | 54 +++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td index acbcbae105dbfb1..847040466a85fd3 100644 --- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td +++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td @@ -80,8 +80,8 @@ def ExecuteRegionOp : SCF_Op<"execute_region", [ DeclareOpInterfaceMethods]> { let summary = "operation that executes its region exactly once"; let description = [{ - The `execute_region` operation is used to allow multiple blocks within SCF - and other operations which can hold only one block. The `execute_region` + The `scf.execute_region` operation is used to allow multiple blocks within SCF + and other operations which can hold only one block. The `scf.execute_region` operation executes the region held exactly once and cannot have any operands. As such, its region has no arguments. All SSA values that dominate the op can be accessed inside the op. The op's region can have multiple blocks and the @@ -344,7 +344,7 @@ def ForallOp : SCF_Op<"forall", [ The only allowed terminator is `scf.forall.in_parallel`. `scf.forall` returns one value per `shared_out` operand. 
The - actions of the `in_parallel` terminators specify how to combine the + actions of the `scf.forall.in_parallel` terminators specify how to combine the partial results of all parallel invocations into a full value, in some unspecified order. The "destination" of each such op must be a `shared_out` block argument of the `scf.forall` op. @@ -633,7 +633,7 @@ def InParallelOp : SCF_Op<"forall.in_parallel", [ ] # GraphRegionNoTerminator.traits> { let summary = "terminates a `forall` block"; let description = [{ - `scf.forall.in_parallel` is a designated terminator for + The `scf.forall.in_parallel` is a designated terminator for the `scf.forall` operation. It has a single region with a single block that contains a flat list of ops. @@ -778,7 +778,7 @@ def ParallelOp : SCF_Op<"parallel", HasParallelRegion]> { let summary = "parallel for operation"; let description = [{ - The "scf.parallel" operation represents a loop nest taking 4 groups of SSA + The `scf.parallel` operation represents a loop nest taking 4 groups of SSA values as operands that represent the lower bounds, upper bounds, steps and initial values, respectively. The operation defines a variadic number of SSA values for its induction variables. It has one region capturing the @@ -787,7 +787,7 @@ def ParallelOp : SCF_Op<"parallel", machine word. The steps are values of type index, required to be positive. The lower and upper bounds specify a half-open range: the range includes the lower bound but does not include the upper bound. The initial values - have the same types as results of "scf.parallel". If there are no results, + have the same types as results of `scf.parallel`. If there are no results, the keyword `init` can be omitted. Semantically we require that the iteration space can be iterated in any @@ -796,17 +796,17 @@ def ParallelOp : SCF_Op<"parallel", The parallel loop operation supports reduction of values produced by individual iterations into a single result. 
This is modeled using the - "scf.reduce" terminator operation (see "scf.reduce" for details). The i-th - result of an "scf.parallel" operation is associated with the i-th initial - value operand, the i-th operand of the "scf.reduce" operation (the value to - be reduced) and the i-th region of the "scf.reduce" operation (the reduction + `scf.reduce` terminator operation (see `scf.reduce` for details). The i-th + result of an `scf.parallel` operation is associated with the i-th initial + value operand, the i-th operand of the `scf.reduce` operation (the value to + be reduced) and the i-th region of the `scf.reduce` operation (the reduction function). Consequently, we require that the number of results of an - "scf.parallel" op matches the number of initial values and the the number of - reductions in the "scf.reduce" terminator. + `scf.parallel` op matches the number of initial values and the the number of + reductions in the `scf.reduce` terminator. The body region must contain exactly one block that terminates with a - "scf.reduce" operation. If an "scf.parallel" op has no reductions, the - terminator has no operands and no regions. The "scf.parallel" parser will + `scf.reduce` operation. If an `scf.parallel` op has no reductions, the + terminator has no operands and no regions. The `scf.parallel` parser will automatically insert the terminator for ops that have no reductions if it is absent. @@ -875,25 +875,25 @@ def ReduceOp : SCF_Op<"reduce", [ DeclareOpInterfaceMethods]> { let summary = "reduce operation for scf.parallel"; let description = [{ - "scf.reduce" is the terminator for "scf.parallel" operations. It can model + The `scf.reduce` operation is the terminator for `scf.parallel` operations. It can model an arbitrary number of reductions. It has one region per reduction. Each region has one block with two arguments which have the same type as the - corresponding operand of "scf.reduce". 
The operands of the op are the values + corresponding operand of `scf.reduce`. The operands of the op are the values that should be reduce; one value per reduction. The i-th reduction (i.e., the i-th region and the i-th operand) corresponds - the i-th initial value and the i-th result of the enclosing "scf.parallel" + the i-th initial value and the i-th result of the enclosing `scf.parallel` op. - The "scf.reduce" operation contains regions whose entry blocks expect two + The `scf.reduce` operation contains regions whose entry blocks expect two arguments of the same type as the corresponding operand. As the iteration order of the enclosing parallel loop and hence reduction order is unspecified, the results of the reductions may be non-deterministic unless the reductions are associative and commutative. - The result of a reduction region ("scf.reduce.return" operand) must have the - same type as the corresponding "scf.reduce" operand and the corresponding - "scf.parallel" initial value. + The result of a reduction region (`scf.reduce.return` operand) must have the + same type as the corresponding `scf.reduce` operand and the corresponding + `scf.parallel` initial value. Example: @@ -929,9 +929,9 @@ def ReduceReturnOp : SCF_Op<"reduce.return", [HasParent<"ReduceOp">, Pure, Terminator]> { let summary = "terminator for reduce operation"; let description = [{ - "scf.reduce.return" is a special terminator operation for the block inside - "scf.reduce" regions. It terminates the region. It should have the same - operand type as the corresponding operand of the enclosing "scf.reduce" op. + The `scf.reduce.return` operation is a special terminator operation for the block inside + `scf.reduce` regions. It terminates the region. It should have the same + operand type as the corresponding operand of the enclosing `scf.reduce` op. 
Example: @@ -1172,12 +1172,12 @@ def YieldOp : SCF_Op<"yield", [Pure, ReturnLike, Terminator, "WhileOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ - "scf.yield" yields an SSA value from the SCF dialect op region and + The `scf.yield` operation yields an SSA value from the SCF dialect op region and terminates the regions. The semantics of how the values are yielded is defined by the parent operation. - If "scf.yield" has any operands, the operands must match the parent + If `scf.yield` has any operands, the operands must match the parent operation's results. - If the parent operation defines no values, then the "scf.yield" may be + If the parent operation defines no values, then the `scf.yield` may be left out in the custom syntax and the builders will insert one implicitly. Otherwise, it has to be present in the syntax to indicate which values are yielded. From 2a05971de2a59a99ef759a6f45828064c2dbeacf Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 8 Aug 2024 08:56:35 -0700 Subject: [PATCH 030/112] [SLP]Add index of the node to the short name output. Improves debugging experience, does nothing with the functionality. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 14 +++++++------- .../SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 575d178374844b7..186b382addd7106 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -309,9 +309,11 @@ static unsigned getNumElems(unsigned Size, unsigned PartNumElems, #if !defined(NDEBUG) /// Print a short descriptor of the instruction bundle suitable for debug output. 
-static std::string shortBundleName(ArrayRef VL) { +static std::string shortBundleName(ArrayRef VL, int Idx = -1) { std::string Result; raw_string_ostream OS(Result); + if (Idx >= 0) + OS << "Idx: " << Idx << ", "; OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]"; OS.flush(); return Result; @@ -10639,7 +10641,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { // Some gather nodes might be absolutely the same as some vectorizable // nodes after reordering, need to handle it. LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " - << shortBundleName(TE.Scalars) << ".\n" + << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); continue; } @@ -10648,7 +10650,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " - << shortBundleName(TE.Scalars) << ".\n" + << shortBundleName(TE.Scalars, TE.Idx) << ".\n" << "SLP: Current total cost = " << Cost << "\n"); } @@ -12638,10 +12640,8 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. - LLVM_DEBUG( - dbgs() - << "SLP: perfect diamond match for gather bundle " - << shortBundleName(E->Scalars) << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle " + << shortBundleName(E->Scalars, E->Idx) << ".\n"); // Restore the mask for previous partially matched values. 
Mask.resize(E->Scalars.size()); const TreeEntry *FrontTE = Entries.front().front(); diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll index 7c05355e98d70b1..b80be40d9fc8610 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll @@ -31,6 +31,6 @@ define void @fun(ptr nocapture, i32 zeroext) local_unnamed_addr #0 { ._crit_edge: ; preds = %.lr.ph ret void -; CHECK: SLP: Adding cost -1 for bundle n=2 [ %4 = icmp ult i32 %2, %1, ..] +; CHECK: SLP: Adding cost -1 for bundle Idx: 3, n=2 [ %4 = icmp ult i32 %2, %1, ..] } From be40c723ce2b7bf2690d22039d74d21b2bd5b7cf Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Thu, 8 Aug 2024 11:59:39 -0400 Subject: [PATCH 031/112] Revert "StructurizeCFG: Optimize phi insertion during ssa reconstruction (#101301)" This reverts commit c62e2a2a4ed69d53a3c6ca5c24ee8d2504d6ba2b. 
Since it caused regression in HIP buildbot: https://lab.llvm.org/buildbot/#/builders/123/builds/3282 --- llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 134 +++--------------- .../AMDGPU/agpr-copy-no-free-registers.ll | 26 ++-- llvm/test/CodeGen/AMDGPU/while-break.ll | 13 +- .../AMDGPU/loop-subregion-misordered.ll | 23 +-- .../StructurizeCFG/loop-break-phi.ll | 63 ++++---- 5 files changed, 92 insertions(+), 167 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index f173d9ca9be07ad..9c711ec183821ff 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -8,7 +8,6 @@ #include "llvm/Transforms/Scalar/StructurizeCFG.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" @@ -289,10 +288,6 @@ class StructurizeCFG { void findUndefBlocks(BasicBlock *PHIBlock, const SmallSet &Incomings, SmallVector &UndefBlks) const; - - void mergeIfCompatible(EquivalenceClasses &PhiClasses, PHINode *A, - PHINode *B); - void setPhiValues(); void simplifyAffectedPhis(); @@ -715,103 +710,10 @@ void StructurizeCFG::findUndefBlocks( } } -// If two phi nodes have compatible incoming values (for each -// incoming block, either they have the same incoming value or only one phi -// node has an incoming value), let them share the merged incoming values. The -// merge process is guided by the equivalence information from \p PhiClasses. -// The function will possibly update the incoming values of leader phi in -// DeletedPhis. -void StructurizeCFG::mergeIfCompatible( - EquivalenceClasses &PhiClasses, PHINode *A, PHINode *B) { - auto ItA = PhiClasses.findLeader(PhiClasses.insert(A)); - auto ItB = PhiClasses.findLeader(PhiClasses.insert(B)); - // They are already in the same class, no work needed. 
- if (ItA == ItB) - return; - - PHINode *LeaderA = *ItA; - PHINode *LeaderB = *ItB; - BBValueVector &IncomingA = DeletedPhis[LeaderA->getParent()][LeaderA]; - BBValueVector &IncomingB = DeletedPhis[LeaderB->getParent()][LeaderB]; - - DenseMap Mergeable(IncomingA.begin(), IncomingA.end()); - for (auto [BB, V] : IncomingB) { - auto BBIt = Mergeable.find(BB); - if (BBIt != Mergeable.end() && BBIt->second != V) - return; - // Either IncomingA does not have this value or IncomingA has the same - // value. - Mergeable.insert({BB, V}); - } - - // Update the incoming value of leaderA. - IncomingA.assign(Mergeable.begin(), Mergeable.end()); - PhiClasses.unionSets(ItA, ItB); -} - /// Add the real PHI value as soon as everything is set up void StructurizeCFG::setPhiValues() { SmallVector InsertedPhis; SSAUpdater Updater(&InsertedPhis); - DenseMap> UndefBlksMap; - - // Find phi nodes that have compatible incoming values (either they have - // the same value for the same block or only one phi node has an incoming - // value, see example below). We only search again the phi's that are - // referenced by another phi, which is the case we care about. - // - // For example (-- means no incoming value): - // phi1 : BB1:phi2 BB2:v BB3:-- - // phi2: BB1:-- BB2:v BB3:w - // - // Then we can merge these incoming values and let phi1, phi2 use the - // same set of incoming values: - // - // phi1&phi2: BB1:phi2 BB2:v BB3:w - // - // By doing this, phi1 and phi2 would share more intermediate phi nodes. - // This would help reduce the number of phi nodes during SSA reconstruction - // and ultimately result in fewer COPY instructions. - // - // This should be correct, because if a phi node does not have incoming - // value from certain block, this means the block is not the predecessor - // of the parent block, so we actually don't care about its incoming value. 
- EquivalenceClasses PhiClasses; - for (const auto &[To, From] : AddedPhis) { - auto OldPhiIt = DeletedPhis.find(To); - if (OldPhiIt == DeletedPhis.end()) - continue; - - PhiMap &BlkPhis = OldPhiIt->second; - SmallVector &UndefBlks = - UndefBlksMap.FindAndConstruct(To).second; - SmallSet Incomings; - - // Get the undefined blocks shared by all the phi nodes. - if (!BlkPhis.empty()) { - for (const auto &VI : BlkPhis.front().second) - Incomings.insert(VI.first); - findUndefBlocks(To, Incomings, UndefBlks); - } - - for (const auto &[Phi, Incomings] : OldPhiIt->second) { - SmallVector IncomingPHIs; - for (const auto &[BB, V] : Incomings) { - // First, for each phi, check whether it has incoming value which is - // another phi. - if (PHINode *P = dyn_cast(V)) - IncomingPHIs.push_back(P); - } - - for (auto *OtherPhi : IncomingPHIs) { - // Skip phis that are unrelated to the phi reconstruction for now. - if (!DeletedPhis.contains(OtherPhi->getParent())) - continue; - mergeIfCompatible(PhiClasses, Phi, OtherPhi); - } - } - } - for (const auto &AddedPhi : AddedPhis) { BasicBlock *To = AddedPhi.first; const BBVector &From = AddedPhi.second; @@ -819,27 +721,28 @@ void StructurizeCFG::setPhiValues() { if (!DeletedPhis.count(To)) continue; + SmallVector UndefBlks; + bool CachedUndefs = false; PhiMap &Map = DeletedPhis[To]; - SmallVector &UndefBlks = UndefBlksMap[To]; - for (const auto &[Phi, Incoming] : Map) { + for (const auto &PI : Map) { + PHINode *Phi = PI.first; Value *Undef = UndefValue::get(Phi->getType()); Updater.Initialize(Phi->getType(), ""); Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); Updater.AddAvailableValue(To, Undef); - // Use leader phi's incoming if there is. - auto LeaderIt = PhiClasses.findLeader(Phi); - bool UseIncomingOfLeader = - LeaderIt != PhiClasses.member_end() && *LeaderIt != Phi; - const auto &IncomingMap = - UseIncomingOfLeader ? 
DeletedPhis[(*LeaderIt)->getParent()][*LeaderIt] - : Incoming; - + SmallSet Incomings; SmallVector ConstantPreds; - for (const auto &[BB, V] : IncomingMap) { - Updater.AddAvailableValue(BB, V); - if (isa(V)) - ConstantPreds.push_back(BB); + for (const auto &VI : PI.second) { + Incomings.insert(VI.first); + Updater.AddAvailableValue(VI.first, VI.second); + if (isa(VI.second)) + ConstantPreds.push_back(VI.first); + } + + if (!CachedUndefs) { + findUndefBlocks(To, Incomings, UndefBlks); + CachedUndefs = true; } for (auto UB : UndefBlks) { @@ -850,10 +753,6 @@ void StructurizeCFG::setPhiValues() { if (any_of(ConstantPreds, [&](BasicBlock *CP) { return DT->dominates(CP, UB); })) continue; - // Maybe already get a value through sharing with other phi nodes. - if (Updater.HasValueForBlock(UB)) - continue; - Updater.AddAvailableValue(UB, Undef); } @@ -861,7 +760,10 @@ void StructurizeCFG::setPhiValues() { Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI)); AffectedPhis.push_back(Phi); } + + DeletedPhis.erase(To); } + assert(DeletedPhis.empty()); AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end()); } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index e0226c35cc2de69..fb96b9ff2952e82 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -577,11 +577,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] ; GFX908-NEXT: v_mov_b32_e32 v4, s8 ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6 -; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v8, s8 +; GFX908-NEXT: v_mov_b32_e32 v6, s8 ; GFX908-NEXT: v_mov_b32_e32 v5, s9 -; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: v_mov_b32_e32 v9, s9 +; GFX908-NEXT: v_mov_b32_e32 v7, s9 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX908-NEXT: 
v_mov_b32_e32 v11, v5 ; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11] @@ -642,10 +642,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v12, v20, v12 ; GFX908-NEXT: v_add_f32_e32 v5, v5, v25 ; GFX908-NEXT: v_add_f32_e32 v4, v4, v24 -; GFX908-NEXT: v_add_f32_e32 v7, v7, v27 -; GFX908-NEXT: v_add_f32_e32 v6, v6, v26 -; GFX908-NEXT: v_add_f32_e32 v8, v8, v14 -; GFX908-NEXT: v_add_f32_e32 v9, v9, v15 +; GFX908-NEXT: v_add_f32_e32 v9, v9, v27 +; GFX908-NEXT: v_add_f32_e32 v8, v8, v26 +; GFX908-NEXT: v_add_f32_e32 v6, v6, v14 +; GFX908-NEXT: v_add_f32_e32 v7, v7, v15 ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_mov_b64 s[20:21], -1 @@ -655,6 +655,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 ; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX908-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX908-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX908-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard @@ -740,8 +744,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1] ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8 -; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1] +; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1] ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0 ; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11] ; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1] @@ -797,8 +801,8 @@ define amdgpu_kernel void 
@introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15] ; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25] -; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27] -; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] +; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27] +; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_mov_b64 s[20:21], -1 ; GFX90A-NEXT: s_branch .LBB3_4 @@ -807,6 +811,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13 +; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11 +; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $sgpr18_sgpr19 ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard diff --git a/llvm/test/CodeGen/AMDGPU/while-break.ll b/llvm/test/CodeGen/AMDGPU/while-break.ll index 9bb8a2f9f0282c2..46254994580d2d0 100644 --- a/llvm/test/CodeGen/AMDGPU/while-break.ll +++ b/llvm/test/CodeGen/AMDGPU/while-break.ll @@ -162,8 +162,8 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: s_branch .LBB2_2 ; GCN-NEXT: .LBB2_1: ; %Flow1 ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 -; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_and_b32 s1, exec_lo, s1 +; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GCN-NEXT: s_and_b32 s1, exec_lo, s4 ; GCN-NEXT: s_or_b32 s2, s1, s2 ; GCN-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 ; GCN-NEXT: s_cbranch_execz .LBB2_6 @@ -190,17 +190,20 @@ define amdgpu_ps < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i ; GCN-NEXT: .LBB2_4: ; %Flow ; GCN-NEXT: ; in 
Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN-NEXT: s_mov_b32 s1, -1 -; GCN-NEXT: s_and_saveexec_b32 s4, s3 +; GCN-NEXT: v_mov_b32_e32 v7, v6 +; GCN-NEXT: s_mov_b32 s4, -1 +; GCN-NEXT: s_and_saveexec_b32 s1, s3 ; GCN-NEXT: s_cbranch_execz .LBB2_1 ; GCN-NEXT: ; %bb.5: ; %latch ; GCN-NEXT: ; in Loop: Header=BB2_2 Depth=1 ; GCN-NEXT: v_cmp_lt_i32_e32 vcc_lo, s0, v3 +; GCN-NEXT: v_mov_b32_e32 v7, v0 ; GCN-NEXT: s_add_i32 s0, s0, 1 -; GCN-NEXT: s_orn2_b32 s1, vcc_lo, exec_lo +; GCN-NEXT: s_orn2_b32 s4, vcc_lo, exec_lo ; GCN-NEXT: s_branch .LBB2_1 ; GCN-NEXT: .LBB2_6: ; %end ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GCN-NEXT: v_mov_b32_e32 v0, v7 ; GCN-NEXT: v_mov_b32_e32 v1, v6 ; GCN-NEXT: ; return to shader part epilog entry: diff --git a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll index 385e37e2750d1e3..10a3e65e5f57d67 100644 --- a/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll +++ b/llvm/test/Transforms/StructurizeCFG/AMDGPU/loop-subregion-misordered.ll @@ -28,7 +28,7 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, ptr addrspace(1) [[GEP]], align 4 ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: LOOP.HEADER: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP3:%.*]], [[FLOW3:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP5:%.*]], [[FLOW3:%.*]] ] ; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) null, i64 [[TMP12]] @@ -49,8 +49,8 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]] ; CHECK: Flow2: 
-; CHECK-NEXT: [[TMP3]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP6:%.*]], [[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP9:%.*]], [[FLOW]] ] ; CHECK-NEXT: br i1 [[TMP4]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW3]] ; CHECK: INNER_LOOP: ; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ] @@ -66,19 +66,20 @@ define amdgpu_kernel void @loop_subregion_misordered(ptr addrspace(1) %arg0) #0 ; CHECK-NEXT: [[LOAD13:%.*]] = icmp uge i32 [[TMP16]], 271 ; CHECK-NEXT: br i1 [[LOAD13]], label [[INCREMENT_I]], label [[FLOW1:%.*]] ; CHECK: Flow3: -; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] -; CHECK-NEXT: br i1 [[TMP5]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] +; CHECK-NEXT: [[TMP5]] = phi i32 [ [[TMP3]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW2]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK:%.*]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW2]] ] +; CHECK-NEXT: br i1 [[TMP6]], label [[FLOW4:%.*]], label [[LOOP_HEADER]] ; CHECK: Flow4: -; CHECK-NEXT: br i1 [[TMP7:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] +; CHECK-NEXT: br i1 [[TMP8:%.*]], label [[BB64:%.*]], label [[RETURN:%.*]] ; CHECK: bb64: ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #[[ATTR0]] ; CHECK-NEXT: br label [[RETURN]] ; CHECK: Flow: -; CHECK-NEXT: [[TMP6]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP7]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] -; CHECK-NEXT: br i1 
[[TMP9]], label [[BB18]], label [[FLOW2]] +; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP0]], [[FLOW1]] ], [ undef, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP1]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP9]] = phi i1 [ [[TMP2]], [[FLOW1]] ], [ false, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ false, [[FLOW1]] ], [ true, [[LOOP_HEADER]] ] +; CHECK-NEXT: br i1 [[TMP10]], label [[BB18]], label [[FLOW2]] ; CHECK: INCREMENT_I: ; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336 diff --git a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll index 46881ec8272861c..c832b7d1394a880 100644 --- a/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll +++ b/llvm/test/Transforms/StructurizeCFG/loop-break-phi.ll @@ -7,8 +7,8 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[ELSE:.*]], label %[[FLOW:.*]] ; CHECK: [[FLOW]]: @@ -23,17 +23,20 @@ define float @while_break(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ undef, 
%[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, %[[IF]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -72,8 +75,8 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW2:.*]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW2]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW2:.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[FLOW2]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp sge i32 [[IND]], [[X]] ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] ; CHECK: [[IF]]: @@ -88,17 +91,20 @@ define float @while_break2(i32 %z, float %v, i32 %x, i32 %y) #0 { ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW1]] ; 
CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[LATCH:.*]], label %[[FLOW2]] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[V_1]], %[[ELSE]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[CC2]], %[[ELSE]] ], [ [[TMP1]], %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[LATCH:.*]], label %[[FLOW2]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW2]] ; CHECK: [[FLOW2]]: -; CHECK-NEXT: [[TMP5]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] -; CHECK-NEXT: br i1 [[TMP6]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP6]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP7]] = phi float [ [[TMP4]], %[[LATCH]] ], [ undef, %[[FLOW1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP4]], %[[LATCH]] ], [ [[TMP3]], %[[FLOW1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW1]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: ret float [[TMP8]] ; @@ -137,9 +143,9 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[HEADER:.*]] ; CHECK: [[HEADER]]: -; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ [[TMP8:%.*]], %[[FLOW1:.*]] ] -; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[FLOW1]] ] -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[V_1:%.*]] = phi float [ [[V]], %[[ENTRY]] ], [ 
[[TMP6:%.*]], %[[FLOW1:.*]] ] +; CHECK-NEXT: [[V_COPY:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[FLOW1]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[FLOW1]] ] ; CHECK-NEXT: [[CC:%.*]] = icmp slt i32 [[IND]], [[X]] ; CHECK-NEXT: [[CC_INV:%.*]] = xor i1 [[CC]], true ; CHECK-NEXT: br i1 [[CC]], label %[[IF:.*]], label %[[FLOW:.*]] @@ -150,18 +156,23 @@ define < 2 x float> @while_break_two_chains_of_phi(float %v, i32 %x, i32 %y, i32 ; CHECK-NEXT: [[CC2:%.*]] = icmp slt i32 [[IND]], [[Y]] ; CHECK-NEXT: br label %[[FLOW]] ; CHECK: [[FLOW]]: -; CHECK-NEXT: [[TMP7]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP8]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] -; CHECK-NEXT: br i1 [[TMP2]], label %[[LATCH:.*]], label %[[FLOW1]] +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ undef, %[[HEADER]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_COPY]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi float [ [[V_IF]], %[[IF]] ], [ [[V_1]], %[[HEADER]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi i1 [ [[CC2]], %[[IF]] ], [ [[CC_INV]], %[[HEADER]] ] +; CHECK-NEXT: br i1 [[TMP3]], label %[[LATCH:.*]], label %[[FLOW1]] ; CHECK: [[LATCH]]: ; CHECK-NEXT: [[IND_INC:%.*]] = add i32 [[IND]], 1 ; CHECK-NEXT: [[CC3:%.*]] = icmp slt i32 [[IND]], [[Z]] ; CHECK-NEXT: br label %[[FLOW1]] ; CHECK: [[FLOW1]]: -; CHECK-NEXT: [[TMP3]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] -; CHECK-NEXT: br i1 [[TMP4]], label %[[END:.*]], label %[[HEADER]] +; CHECK-NEXT: [[TMP4]] = phi i32 [ [[IND_INC]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP5]] = phi float [ [[TMP1]], %[[LATCH]] ], [ undef, %[[FLOW]] ] +; CHECK-NEXT: [[TMP6]] = phi float [ [[TMP2]], %[[LATCH]] ], [ undef, 
%[[FLOW]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi float [ [[TMP1]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP2]], %[[LATCH]] ], [ [[TMP0]], %[[FLOW]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ [[CC3]], %[[LATCH]] ], [ true, %[[FLOW]] ] +; CHECK-NEXT: br i1 [[TMP9]], label %[[END:.*]], label %[[HEADER]] ; CHECK: [[END]]: ; CHECK-NEXT: [[PACKED0:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 ; CHECK-NEXT: [[PACKED1:%.*]] = insertelement <2 x float> [[PACKED0]], float [[TMP7]], i32 1 From 04da77308fcff417d9c925502d92d70761b5054b Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Thu, 8 Aug 2024 18:07:09 +0200 Subject: [PATCH 032/112] Allow empty range attribute and add assert for full range (#100601) fix #99619 --- llvm/docs/LangRef.rst | 3 ++- llvm/lib/AsmParser/LLParser.cpp | 4 ++-- llvm/lib/IR/Attributes.cpp | 4 ++++ llvm/test/Assembler/range-attribute-invalid-range.ll | 4 ++-- llvm/test/Bitcode/attributes.ll | 4 ++-- .../Transforms/Inline/ret_attr_align_and_noundef.ll | 11 +++++++++++ 6 files changed, 23 insertions(+), 7 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 36c9e8bb339f920..0ee4d7b444cfcf7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1675,7 +1675,8 @@ Currently, only the following parameter attributes are defined: - The pair ``a,b`` represents the range ``[a,b)``. - Both ``a`` and ``b`` are constants. - The range is allowed to wrap. - - The range should not represent the full or empty set. That is, ``a!=b``. + - The empty range is represented using ``0,0``. + - Otherwise, ``a`` and ``b`` are not allowed to be equal. This attribute may only be applied to parameters or return values with integer or vector of integer types. 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 9358f89e2bf9dca..f41907f03512576 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -3109,8 +3109,8 @@ bool LLParser::parseRangeAttr(AttrBuilder &B) { if (ParseAPSInt(BitWidth, Lower) || parseToken(lltok::comma, "expected ','") || ParseAPSInt(BitWidth, Upper)) return true; - if (Lower == Upper) - return tokError("the range should not represent the full or empty set!"); + if (Lower == Upper && !Lower.isZero()) + return tokError("the range represent the empty set but limits aren't 0!"); if (parseToken(lltok::rparen, "expected ')'")) return true; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index abd05e316bec148..fa124e46483dcec 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -171,6 +171,7 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind, const ConstantRange &CR) { assert(Attribute::isConstantRangeAttrKind(Kind) && "Not a ConstantRange attribute"); + assert(!CR.isFullSet() && "ConstantRange attribute must not be full"); LLVMContextImpl *pImpl = Context.pImpl; FoldingSetNodeID ID; ID.AddInteger(Kind); @@ -2020,6 +2021,9 @@ AttrBuilder &AttrBuilder::addInAllocaAttr(Type *Ty) { AttrBuilder &AttrBuilder::addConstantRangeAttr(Attribute::AttrKind Kind, const ConstantRange &CR) { + if (CR.isFullSet()) + return *this; + return addAttribute(Attribute::get(Ctx, Kind, CR)); } diff --git a/llvm/test/Assembler/range-attribute-invalid-range.ll b/llvm/test/Assembler/range-attribute-invalid-range.ll index cf6d3f0801838c9..1ddb6745e5dc208 100644 --- a/llvm/test/Assembler/range-attribute-invalid-range.ll +++ b/llvm/test/Assembler/range-attribute-invalid-range.ll @@ -1,6 +1,6 @@ ; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: the range should not represent the full or empty set! 
-define void @range_empty(i8 range(i8 0, 0) %a) { +; CHECK: the range represent the empty set but limits aren't 0! +define void @range_empty(i8 range(i8 1, 1) %a) { ret void } diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index b25bd7582a05afd..4402289ac170d9b 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -537,8 +537,8 @@ define range(i32 -1, 42) i32 @range_attribute(<4 x i32> range(i32 -1, 42) %a) { ret i32 0 } -; CHECK: define range(i32 0, 42) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) -define range(i32 0, 42) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) { +; CHECK: define range(i32 0, 0) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) +define range(i32 0, 0) i32 @range_attribute_same_range_other_bitwidth(i8 range(i8 0, 42) %a) { ret i32 0 } diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll index f4cebf1fcb5da0c..dc685d2c4d1368a 100644 --- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll +++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll @@ -410,3 +410,14 @@ define i8 @caller15_okay_intersect_ranges() { call void @use.val(i8 %r) ret i8 %r } + +define i8 @caller16_not_intersecting_ranges() { +; CHECK-LABEL: define i8 @caller16_not_intersecting_ranges() { +; CHECK-NEXT: [[R_I:%.*]] = call range(i8 0, 0) i8 @val8() +; CHECK-NEXT: call void @use.val(i8 [[R_I]]) +; CHECK-NEXT: ret i8 [[R_I]] +; + %r = call range(i8 0, 5) i8 @callee15() + call void @use.val(i8 %r) + ret i8 %r +} From 32c69faa6ce58333c26293a7708fa3f71991c55c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Aug 2024 17:08:53 +0100 Subject: [PATCH 033/112] [SLP] Regenerate test checks to reduce NFC diff in #100904 --- .../SLPVectorizer/RISCV/complex-loads.ll | 74 +++++++++---------- .../Transforms/SLPVectorizer/X86/diamond.ll | 32 ++++---- 
.../SLPVectorizer/X86/extract_in_tree_user.ll | 16 ++-- ...gathered-delayed-nodes-with-reused-user.ll | 38 +++++----- .../SLPVectorizer/X86/geps-non-pow-2.ll | 36 ++++----- .../SLPVectorizer/X86/operandorder.ll | 18 ++--- .../X86/reorder_with_reordered_users.ll | 16 ++-- 7 files changed, 115 insertions(+), 115 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 4978991b42c8f19..36681ecea4f50fd 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -142,27 +142,27 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> +; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> ; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP81]], [[TMP76]] +; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP78]], [[TMP76]] ; CHECK-NEXT: 
[[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP77]] to <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> +; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> +; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> ; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP89:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP89]] +; CHECK-NEXT: [[TMP90:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] +; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP90]] ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP99:%.*]] = sub <2 x i32> [[TMP86]], [[TMP78]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> 
[[TMP88]], [[TMP99]] +; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP86]], [[TMP77]] +; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP98]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> ; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] ; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] @@ -182,21 +182,21 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP114:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP109]], [[TMP114]] +; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> +; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) +; 
CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP155]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP113:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP113]] to <2 x i32> -; CHECK-NEXT: [[TMP115:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP115]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP118]], [[TMP128]] +; CHECK-NEXT: [[TMP156:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP156]] to <2 x i32> +; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> +; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 ; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP137]], [[TMP111]] @@ -480,11 +480,11 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP84:%.*]] = sub <2 x 
i32> [[TMP78]], [[TMP80]] ; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], ; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] -; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP87]] +; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] +; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP93]] ; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] -; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP93]] +; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] +; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP87]] ; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> ; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]] ; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]] @@ -521,13 +521,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]] ; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], ; THR15-NEXT: [[TMP115:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP116:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] -; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP114]], [[TMP116]] +; THR15-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] +; THR15-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP114]], [[TMP117]] ; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0 ; THR15-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP99]] ; THR15-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP102]], [[TMP127]] -; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP117]], [[TMP128]] -; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP117]] +; THR15-NEXT: [[TMP106:%.*]] = add 
<2 x i32> [[TMP116]], [[TMP128]] +; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP116]] ; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 ; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 ; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP118]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll index 0937b686ee2f75d..b2bcdb178b21be4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -16,10 +16,10 @@ define i32 @foo(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 ; CHECK-NEXT: ret i32 0 ; @@ -59,13 +59,13 @@ define i32 @extr_user(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n ; CHECK-LABEL: @extr_user( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0 +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: %0 = load i32, ptr %A, align 4 @@ -95,13 +95,13 @@ define i32 @extr_user1(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 % ; CHECK-LABEL: @extr_user1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MUL238:%.*]] = add i32 [[M:%.*]], [[N:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP0]], [[TMP2]] ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1 +; CHECK-NEXT: ret i32 [[TMP4]] ; entry: %0 = load i32, ptr %A, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index c600d75ed1e8c4c..0eb18239ae3fb6e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -13,9 +13,9 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[ADD_PTR]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -92,11 +92,11 @@ define void @externally_used_ptrs() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]] -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[TMP4]], align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ADD_PTR]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store 
<2 x i64> [[TMP6]], ptr [[ADD_PTR]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll index d7144d750321fe1..f197b2480d61ca8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-delayed-nodes-with-reused-user.ll @@ -4,6 +4,25 @@ ; RUN: -slp-skip-early-profitability-check < %s | FileCheck %s --check-prefixes=FORCED define i64 @foo() { +; CHECK-LABEL: define i64 @foo() { +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ] +; CHECK-NEXT: [[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ] +; CHECK-NEXT: ret i64 0 +; CHECK: bb3: +; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0 +; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0 +; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0 +; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] +; ; FORCED-LABEL: define i64 @foo() { ; FORCED-NEXT: bb: ; FORCED-NEXT: br label [[BB3:%.*]] @@ -25,25 +44,6 @@ define i64 @foo() { ; FORCED-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 ; FORCED-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] ; -; CHECK-LABEL: define i64 @foo() { -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB3:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[ADD:%.*]], [[BB3]] ] -; CHECK-NEXT: 
[[PHI2:%.*]] = phi i64 [ [[TMP9:%.*]], [[BB3]] ] -; CHECK-NEXT: ret i64 0 -; CHECK: bb3: -; CHECK-NEXT: [[PHI5:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ 0, [[BB3]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i64> [ zeroinitializer, [[BB]] ], [ [[TMP7:%.*]], [[BB3]] ] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 -; CHECK-NEXT: [[ADD]] = add i64 [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i64, ptr addrspace(1) null, i64 0 -; CHECK-NEXT: [[TMP9]] = or i64 [[PHI5]], 0 -; CHECK-NEXT: [[ICMP:%.*]] = icmp ult i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i64> , i64 [[ADD]], i32 0 -; CHECK-NEXT: br i1 false, label [[BB3]], label [[BB1:%.*]] -; bb: br label %bb3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index e459cd8c6955b03..e94dd2119270ce1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -11,37 +11,37 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[A_020:%.*]] = phi ptr [ [[A_020_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x 
ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 +; CHECK-NEXT: store i32 [[TMP8]], ptr [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: -; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[A_020]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: 
[[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x ptr> [[TMP5]], i32 0 ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[A_020_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR7]], [[SW_BB6]] ], [ [[INCDEC_PTR4]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 8562e53b1538722..9df2b9a8e8f3ec0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -189,11 +189,11 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 +; 
CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -238,10 +238,10 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll index eb5d13d6fc19d74..69b4639d9c13187 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_with_reordered_users.ll @@ -47,22 +47,22 @@ define void @reorder_crash(ptr %ptr) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br 
i1 undef, label [[BB0:%.*]], label [[BB12:%.*]] ; CHECK: bb0: -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[PTR:%.*]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP0]], ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb12: ; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[PTR]], align 4 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR]], align 4 +; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[PTR]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[PTR]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTR]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP1]], [[BB0]] ], [ [[TMP4]], [[BB1]] ], [ [[SHUFFLE]], [[BB2]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x float> [ [[TMP0]], [[BB0]] ], [ [[TMP1]], [[BB1]] ], [ [[TMP4]], [[BB2]] ] ; CHECK-NEXT: ret void ; entry: From 73aa4e4b3ec69480cc3981ab3eda091fc95f30cf Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Aug 2024 09:15:43 -0700 Subject: [PATCH 034/112] [RISCV] Use listconcat to shorten some of the profile feature lists. (#102356) The profiles in a family gain more features overtime so each year can mostly inherit from the previous year. I did make an exception for Za128rs becoming Za64rs in RVA22. 
Technically Za64rs is a stricter requirement than Za128rs so it would be ok to have both, but I didn't want to change existing ISA strings. I used RVA20U64BaseFeatures as the base for RVB23U64 to shorten its list a bit. RVB23 is the first year for that family and was created as a reaction to too many features being added to RVA so inheriting from an earlier RVA seemed somewhat reasonable to me. --- llvm/lib/Target/RISCV/RISCVProfiles.td | 264 +++++++++++-------------- 1 file changed, 113 insertions(+), 151 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td index c4a64681f5f123a..157e087a64da07b 100644 --- a/llvm/lib/Target/RISCV/RISCVProfiles.td +++ b/llvm/lib/Target/RISCV/RISCVProfiles.td @@ -6,169 +6,113 @@ // //===----------------------------------------------------------------------===// -class RISCVProfile features> - : SubtargetFeature { - // Indicates if the profile is not yet ratified, so should be treated as - // experimental. 
- bit Experimental = false; -} -class RISCVExperimentalProfile features> - : RISCVProfile<"experimental-"#name, features> { - let Experimental = true; -} +//===----------------------------------------------------------------------===// +// Profile Feature Lists +//===----------------------------------------------------------------------===// + +// RVI Profile Family defvar RVI20U32Features = [Feature32Bit, FeatureStdExtI]; defvar RVI20U64Features = [Feature64Bit, FeatureStdExtI]; -defvar RVA20U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZa128rs, - FeatureStdExtZicclsm]; +// RVA Profile Family + +defvar RVA20U64BaseFeatures = [Feature64Bit, + FeatureStdExtI, + FeatureStdExtM, + FeatureStdExtA, + FeatureStdExtF, + FeatureStdExtD, + FeatureStdExtC, + FeatureStdExtZicntr, + FeatureStdExtZiccif, + FeatureStdExtZiccrse, + FeatureStdExtZiccamoa, + FeatureStdExtZicclsm]; +defvar RVA20U64Features = !listconcat(RVA20U64BaseFeatures, + [FeatureStdExtZa128rs]); +defvar RVA20S64BaseFeatures = [FeatureStdExtZifencei, + FeatureStdExtSvbare, + FeatureStdExtSvade, + FeatureStdExtSsccptr, + FeatureStdExtSstvecd, + FeatureStdExtSstvala]; defvar RVA20S64Features = !listconcat(RVA20U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala]); + RVA20S64BaseFeatures); -defvar RVA22U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihpm, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - 
FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZfhmin, - FeatureStdExtZkt]; +defvar RVA22U64Features = !listconcat(RVA20U64BaseFeatures, + [FeatureStdExtZa64rs, + FeatureStdExtZihpm, + FeatureStdExtZihintpause, + FeatureStdExtZba, + FeatureStdExtZbb, + FeatureStdExtZbs, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZfhmin, + FeatureStdExtZkt]); +defvar RVA22S64BaseFeatures = !listconcat(RVA20S64BaseFeatures, + [FeatureStdExtSscounterenw, + FeatureStdExtSvpbmt, + FeatureStdExtSvinval]); defvar RVA22S64Features = !listconcat(RVA22U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala, - FeatureStdExtSscounterenw, - FeatureStdExtSvpbmt, - FeatureStdExtSvinval]); + RVA22S64BaseFeatures); -defvar RVA23U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZihpm, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZfhmin, - FeatureStdExtZkt, - FeatureStdExtV, - FeatureStdExtZvfhmin, - FeatureStdExtZvbb, - FeatureStdExtZvkt, - FeatureStdExtZihintntl, - FeatureStdExtZicond, - FeatureStdExtZimop, - FeatureStdExtZcmop, - FeatureStdExtZcb, - FeatureStdExtZfa, - FeatureStdExtZawrs]; +defvar RVA23U64Features = !listconcat(RVA22U64Features, + [FeatureStdExtV, + FeatureStdExtZvfhmin, + FeatureStdExtZvbb, + FeatureStdExtZvkt, + FeatureStdExtZihintntl, + FeatureStdExtZicond, + FeatureStdExtZimop, + FeatureStdExtZcmop, + FeatureStdExtZcb, + FeatureStdExtZfa, + FeatureStdExtZawrs]); +defvar RVA23S64BaseFeatures = 
!listconcat(RVA22S64BaseFeatures, + [FeatureStdExtSvnapot, + FeatureStdExtSstc, + FeatureStdExtSscofpmf, + FeatureStdExtSsnpm, + FeatureStdExtSsu64xl, + FeatureStdExtH, + FeatureStdExtSsstateen, + FeatureStdExtShcounterenw, + FeatureStdExtShvstvala, + FeatureStdExtShtvala, + FeatureStdExtShvstvecd, + FeatureStdExtShvsatpa, + FeatureStdExtShgatpa]); defvar RVA23S64Features = !listconcat(RVA23U64Features, - [FeatureStdExtZifencei, - FeatureStdExtSvbare, - FeatureStdExtSvade, - FeatureStdExtSsccptr, - FeatureStdExtSstvecd, - FeatureStdExtSstvala, - FeatureStdExtSscounterenw, - FeatureStdExtSvpbmt, - FeatureStdExtSvinval, - FeatureStdExtSvnapot, - FeatureStdExtSstc, - FeatureStdExtSscofpmf, - FeatureStdExtSsnpm, - FeatureStdExtSsu64xl, - FeatureStdExtH, - FeatureStdExtSsstateen, - FeatureStdExtShcounterenw, - FeatureStdExtShvstvala, - FeatureStdExtShtvala, - FeatureStdExtShvstvecd, - FeatureStdExtShvsatpa, - FeatureStdExtShgatpa]); - -defvar RVB23U64Features = [Feature64Bit, - FeatureStdExtI, - FeatureStdExtM, - FeatureStdExtA, - FeatureStdExtF, - FeatureStdExtD, - FeatureStdExtC, - FeatureStdExtZicntr, - FeatureStdExtZihpm, - FeatureStdExtZiccif, - FeatureStdExtZiccrse, - FeatureStdExtZiccamoa, - FeatureStdExtZicclsm, - FeatureStdExtZa64rs, - FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, - FeatureStdExtZic64b, - FeatureStdExtZicbom, - FeatureStdExtZicbop, - FeatureStdExtZicboz, - FeatureStdExtZkt, - FeatureStdExtZihintntl, - FeatureStdExtZicond, - FeatureStdExtZimop, - FeatureStdExtZcmop, - FeatureStdExtZcb, - FeatureStdExtZfa, - FeatureStdExtZawrs]; + RVA23S64BaseFeatures); + +// RVB Profile Family + +defvar RVB23U64Features = !listconcat(RVA20U64BaseFeatures, + [FeatureStdExtZihpm, + FeatureStdExtZa64rs, + FeatureStdExtZihintpause, + FeatureStdExtZba, + FeatureStdExtZbb, + FeatureStdExtZbs, + FeatureStdExtZic64b, + FeatureStdExtZicbom, + FeatureStdExtZicbop, + FeatureStdExtZicboz, + FeatureStdExtZkt, + FeatureStdExtZihintntl, 
+ FeatureStdExtZicond, + FeatureStdExtZimop, + FeatureStdExtZcmop, + FeatureStdExtZcb, + FeatureStdExtZfa, + FeatureStdExtZawrs]); defvar RVB23S64Features = !listconcat(RVB23U64Features, [FeatureStdExtZifencei, @@ -185,6 +129,8 @@ defvar RVB23S64Features = !listconcat(RVB23U64Features, FeatureStdExtSscofpmf, FeatureStdExtSsu64xl]); +// RVM Profile Family + defvar RVM23U32Features = [Feature32Bit, FeatureStdExtI, FeatureStdExtM, @@ -199,6 +145,22 @@ defvar RVM23U32Features = [Feature32Bit, FeatureStdExtZimop, FeatureStdExtZcmop]; +//===----------------------------------------------------------------------===// +// Profile Definitions for ISA String +//===----------------------------------------------------------------------===// + +class RISCVProfile features> + : SubtargetFeature { + // Indicates if the profile is not yet ratified, so should be treated as + // experimental. + bit Experimental = false; +} +class RISCVExperimentalProfile features> + : RISCVProfile<"experimental-"#name, features> { + let Experimental = true; +} + def RVI20U32 : RISCVProfile<"rvi20u32", RVI20U32Features>; def RVI20U64 : RISCVProfile<"rvi20u64", RVI20U64Features>; def RVA20U64 : RISCVProfile<"rva20u64", RVA20U64Features>; From 59728193a688a21c1999579044d84d7ee0183e80 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Aug 2024 09:17:43 -0700 Subject: [PATCH 035/112] [RISCV] Disable fixed length vectors with Zve32* without Zvl64b. (#102405) Fixed length vectors use scalable vector containers. With Zve32* and not Zvl64b, vscale is 0.5 due to RVVBitsPerBlock being 64. To support this correctly we need to lower RVVBitsPerBlock to 32 and change our type mapping. But we need RVVBitsPerBlock to always be >= ELEN. This means we need two different mappings depending on ELEN. That is a non-trivial amount of work so disable fixed length vectors without Zvl64b for now. We had almost no tests for Zve32x without Zvl64b which is probably why we never realized that it was broken. Fixes #102352. 
--- llvm/docs/ReleaseNotes.rst | 3 + llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 3 +- .../RISCV/rvv/fixed-vectors-shuffle-rotate.ll | 156 +++++++++--------- .../Transforms/LoopVectorize/RISCV/zvl32b.ll | 33 +--- 4 files changed, 89 insertions(+), 106 deletions(-) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 1ed860de6b9dce9..c98171bc8fd8c1b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -108,6 +108,9 @@ Changes to the RISC-V Backend fill value) rather than NOPs. * Added Syntacore SCR4 CPUs: ``-mcpu=syntacore-scr4-rv32/64`` * ``-mcpu=sifive-p470`` was added. +* Fixed length vector support using RVV instructions now requires VLEN>=64. This + means Zve32x and Zve32f will also require Zvl64b. The prior support was + largely untested. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 61f8379983ef99e..e7db1ededf383b8 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -179,7 +179,8 @@ unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const { } bool RISCVSubtarget::useRVVForFixedLengthVectors() const { - return hasVInstructions() && getMinRVVVectorSizeInBits() != 0; + return hasVInstructions() && + getMinRVVVectorSizeInBits() >= RISCV::RVVBitsPerBlock; } bool RISCVSubtarget::enableSubRegLiveness() const { return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll index 17a63eff26ac134..3cfcb4398a1f00c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -3,8 +3,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | 
FileCheck %s -check-prefixes=ZVKB-V ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-V -; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X -; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X +; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X +; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+zvkb,+zvl64b -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVKB-ZVE32X define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_1: @@ -191,7 +191,7 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e16, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vrev8.v v8, v8 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -215,7 +215,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_8: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 8 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -239,7 +239,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i32_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -263,7 +263,7 @@ define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: 
shuffle_v8i8_as_i32_24: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 2, e32, m1, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 24 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> @@ -287,10 +287,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_8(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_8: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 1 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 7 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 1 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 7 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -313,10 +313,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_16(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_16: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 2 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 6 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 2 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 6 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -339,10 +339,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_24(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_24: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 3 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 5 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 3 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 5 +; 
ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -365,10 +365,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_32(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 4 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 4 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 4 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 4 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -391,10 +391,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_40(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_40: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 5 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 3 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 5 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 3 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -417,10 +417,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_48(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_48: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 6 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 2 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 6 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 2 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle 
@@ -443,10 +443,10 @@ define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i8_as_i64_56: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma -; ZVKB-ZVE32X-NEXT: vslidedown.vi v10, v8, 7 -; ZVKB-ZVE32X-NEXT: vslideup.vi v10, v8, 1 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e8, m1, ta, ma +; ZVKB-ZVE32X-NEXT: vslidedown.vi v9, v8, 7 +; ZVKB-ZVE32X-NEXT: vslideup.vi v9, v8, 1 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v9 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> ret <8 x i8> %shuffle @@ -469,7 +469,7 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8i16_as_i32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> @@ -512,11 +512,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -558,11 +558,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; 
ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -604,11 +604,11 @@ define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> ret <8 x i16> %shuffle @@ -650,12 +650,12 @@ define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI22_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v16, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v24, v16 -; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret 
%shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> ret <8 x i32> %shuffle @@ -678,7 +678,7 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { ; ; ZVKB-ZVE32X-LABEL: shuffle_v8f16_as_i32: ; ZVKB-ZVE32X: # %bb.0: -; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vsetivli zero, 4, e32, m2, ta, ma ; ZVKB-ZVE32X-NEXT: vror.vi v8, v8, 16 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> @@ -721,11 +721,11 @@ define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -767,11 +767,11 @@ define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -813,11 +813,11 @@ define 
<8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 -; ZVKB-ZVE32X-NEXT: vrgather.vv v12, v8, v16 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v10, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v12, v10 +; ZVKB-ZVE32X-NEXT: vrgather.vv v10, v8, v12 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v10 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> ret <8 x half> %shuffle @@ -859,12 +859,12 @@ define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { ; ZVKB-ZVE32X: # %bb.0: ; ZVKB-ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) ; ZVKB-ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) -; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma -; ZVKB-ZVE32X-NEXT: vle8.v v16, (a0) -; ZVKB-ZVE32X-NEXT: vsext.vf2 v24, v16 -; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m8, ta, ma -; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v16, v8, v24 -; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVKB-ZVE32X-NEXT: vsetivli zero, 8, e16, m2, ta, ma +; ZVKB-ZVE32X-NEXT: vle8.v v12, (a0) +; ZVKB-ZVE32X-NEXT: vsext.vf2 v16, v12 +; ZVKB-ZVE32X-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVKB-ZVE32X-NEXT: vrgatherei16.vv v12, v8, v16 +; ZVKB-ZVE32X-NEXT: vmv.v.v v8, v12 ; ZVKB-ZVE32X-NEXT: ret %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> ret <8 x float> %shuffle diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll index ba78216100598d4..f37c32396cd059c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/zvl32b.ll @@ -4,42 +4,21 @@ target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64" -; We can't use scalable 
vectorization for Zvl32b due to RVVBitsPerBlock being -; 64. Since our vscale value is vlen/RVVBitsPerBlock this makes vscale 0. -; Make sure we fall back to fixed vectorization instead. +; We can't vectorize with Zvl32b due to RVVBitsPerBlock being 64. Since our +; vscale value is vlen/RVVBitsPerBlock this makes vscale 0. define void @vector_add_i16(ptr noalias nocapture %a, i16 %v, i64 %n) { ; CHECK-LABEL: @vector_add_i16( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[V:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> [[TMP0]], i32 2, <4 x i1> , <4 x i16> poison) -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i16> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> [[TMP1]], <4 x ptr> [[TMP0]], i32 2, <4 x i1> ) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1020 -; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1020, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V]] +; CHECK-NEXT: [[ADD:%.*]] = add i16 [[ELEM]], [[V:%.*]] ; CHECK-NEXT: store i16 [[ADD]], ptr [[ARRAYIDX]], align 2 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; From a262ac0c680b8907617d30b5d53f438e75e7ce13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Aug 2024 09:18:51 -0700 Subject: [PATCH 036/112] [flang][cuda] Make operations dynamically legal in cuf op conversion (#102220) --- .../Optimizer/Transforms/CufOpConversion.cpp | 17 ++++++++-- flang/test/Fir/CUDA/cuda-allocate.fir | 34 ++++++++++++++++++- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp index 70b503799421616..f059d36315a345a 100644 --- a/flang/lib/Optimizer/Transforms/CufOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CufOpConversion.cpp @@ -234,9 +234,20 @@ class CufOpConversion : public fir::impl::CufOpConversionBase { fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false); 
fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false, /*forceUnifiedTBAATree=*/false, *dl); - - target.addIllegalOp(); + target.addDynamicallyLegalOp([](::cuf::AllocOp op) { + return !mlir::isa(op.getInType()); + }); + target.addDynamicallyLegalOp([](::cuf::FreeOp op) { + if (auto refTy = mlir::dyn_cast_or_null( + op.getDevptr().getType())) { + return !mlir::isa(refTy.getEleTy()); + } + return true; + }); + target.addDynamicallyLegalOp( + [](::cuf::AllocateOp op) { return isBoxGlobal(op); }); + target.addDynamicallyLegalOp( + [](::cuf::DeallocateOp op) { return isBoxGlobal(op); }); patterns.insert(ctx, &*dl, &typeConverter); patterns.insert(ctx); diff --git a/flang/test/Fir/CUDA/cuda-allocate.fir b/flang/test/Fir/CUDA/cuda-allocate.fir index 1274d3921dd854e..569e72f57d6d6c0 100644 --- a/flang/test/Fir/CUDA/cuda-allocate.fir +++ b/flang/test/Fir/CUDA/cuda-allocate.fir @@ -14,7 +14,6 @@ func.func @_QPsub1() { return } - // CHECK-LABEL: func.func @_QPsub1() // CHECK: %[[DESC_RT_CALL:.*]] = fir.call @_FortranACUFAllocDesciptor(%{{.*}}, %{{.*}}, %{{.*}}) : (i64, !fir.ref, i32) -> !fir.ref> // CHECK: %[[DESC:.*]] = fir.convert %[[DESC_RT_CALL]] : (!fir.ref>) -> !fir.ref>>> @@ -27,4 +26,37 @@ func.func @_QPsub1() { // CHECK: %[[BOX_NONE:.*]] = fir.convert %[[DECL_DESC]]#1 : (!fir.ref>>>) -> !fir.ref> // CHECK: fir.call @_FortranACUFFreeDesciptor(%[[BOX_NONE]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.ref, i32) -> none +// Check operations that should not be transformed yet. 
+func.func @_QPsub2() { + %0 = cuf.alloc !fir.array<10xf32> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref> + cuf.free %0 : !fir.ref> {data_attr = #cuf.cuda} + return } + +// CHECK-LABEL: func.func @_QPsub2() +// CHECK: cuf.alloc !fir.array<10xf32> +// CHECK: cuf.free %{{.*}} : !fir.ref> + +fir.global @_QMmod1Ea {data_attr = #cuf.cuda} : !fir.box>> { + %0 = fir.zero_bits !fir.heap> + %c0 = arith.constant 0 : index + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> + fir.has_value %2 : !fir.box>> +} + +func.func @_QPsub3() { + %0 = fir.address_of(@_QMmod1Ea) : !fir.ref>>> + %1:2 = hlfir.declare %0 {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMmod1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = cuf.allocate %1#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + %3 = cuf.deallocate %1#1 : !fir.ref>>> {data_attr = #cuf.cuda} -> i32 + return +} + +// CHECK-LABEL: func.func @_QPsub3() +// CHECK: cuf.allocate +// CHECK: cuf.deallocate + +} + + From 3e7135750cfe41e8b093e19ce5b85ad937e064fd Mon Sep 17 00:00:00 2001 From: Zaara Syeda Date: Thu, 8 Aug 2024 12:27:26 -0400 Subject: [PATCH 037/112] [PPC] Disable vsx and altivec when -msoft-float is used (#100450) We emit an error when -msoft-float and -maltivec/-mvsx is used together, but when -msoft-float is used on its own, there is still +altivec and +vsx in the IR attributes. This patch disables altivec and vsx and all related sub features when -msoft-float is used. 
--- clang/lib/Basic/Targets/PPC.cpp | 72 ++++++++++++--------- clang/test/Driver/ppc-dependent-options.cpp | 40 +++++++++++- clang/test/Driver/ppc-soft-float.c | 28 ++++++++ 3 files changed, 107 insertions(+), 33 deletions(-) create mode 100644 clang/test/Driver/ppc-soft-float.c diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index b5f9adfdd515b0e..04dc436eb1b9cdf 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -469,21 +469,36 @@ void PPCTargetInfo::getTargetDefines(const LangOptions &Opts, // set of options. static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, const std::vector &FeaturesVec) { - // Cannot allow soft-float with Altivec. - if (llvm::is_contained(FeaturesVec, "-hard-float") && - llvm::is_contained(FeaturesVec, "+altivec")) { - Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float" - << "-maltivec"; + auto FindVSXSubfeature = [&](StringRef Feature, StringRef SubOption, + StringRef Option) { + if (llvm::is_contained(FeaturesVec, Feature)) { + Diags.Report(diag::err_opt_not_valid_with_opt) << SubOption << Option; + return true; + } return false; - } + }; - // Cannot allow soft-float with VSX. - if (llvm::is_contained(FeaturesVec, "-hard-float") && - llvm::is_contained(FeaturesVec, "+vsx")) { - Diags.Report(diag::err_opt_not_valid_with_opt) << "-msoft-float" - << "-mvsx"; - return false; + // Cannot allow soft-float with VSX, Altivec, or any + // VSX subfeatures. 
+ bool Found = false; + if (llvm::is_contained(FeaturesVec, "-hard-float")) { + Found |= FindVSXSubfeature("+vsx", "-mvsx", "-msoft-float"); + Found |= FindVSXSubfeature("+altivec", "-maltivec", "-msoft-float"); + Found |= + FindVSXSubfeature("+power8-vector", "-mpower8-vector", "-msoft-float"); + Found |= FindVSXSubfeature("+direct-move", "-mdirect-move", "-msoft-float"); + Found |= FindVSXSubfeature("+float128", "-mfloat128", "-msoft-float"); + Found |= + FindVSXSubfeature("+power9-vector", "-mpower9-vector", "-msoft-float"); + Found |= FindVSXSubfeature("+paired-vector-memops", + "-mpaired-vector-memops", "-msoft-float"); + Found |= FindVSXSubfeature("+mma", "-mmma", "-msoft-float"); + Found |= FindVSXSubfeature("+crypto", "-mcrypto", "-msoft-float"); + Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector", + "-msoft-float"); } + if (Found) + return false; // Cannot allow VSX with no Altivec. if (llvm::is_contained(FeaturesVec, "+vsx") && @@ -497,21 +512,14 @@ static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags, if (!llvm::is_contained(FeaturesVec, "-vsx")) return true; - auto FindVSXSubfeature = [&](StringRef Feature, StringRef Option) { - if (llvm::is_contained(FeaturesVec, Feature)) { - Diags.Report(diag::err_opt_not_valid_with_opt) << Option << "-mno-vsx"; - return true; - } - return false; - }; - - bool Found = FindVSXSubfeature("+power8-vector", "-mpower8-vector"); - Found |= FindVSXSubfeature("+direct-move", "-mdirect-move"); - Found |= FindVSXSubfeature("+float128", "-mfloat128"); - Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector"); - Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops"); - Found |= FindVSXSubfeature("+mma", "-mmma"); - Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector"); + Found = FindVSXSubfeature("+power8-vector", "-mpower8-vector", "-mno-vsx"); + Found |= FindVSXSubfeature("+direct-move", "-mdirect-move", "-mno-vsx"); + Found |= FindVSXSubfeature("+float128", 
"-mfloat128", "-mno-vsx"); + Found |= FindVSXSubfeature("+power9-vector", "-mpower9-vector", "-mno-vsx"); + Found |= FindVSXSubfeature("+paired-vector-memops", "-mpaired-vector-memops", + "-mno-vsx"); + Found |= FindVSXSubfeature("+mma", "-mmma", "-mno-vsx"); + Found |= FindVSXSubfeature("+power10-vector", "-mpower10-vector", "-mno-vsx"); // Return false if any vsx subfeatures was found. return !Found; @@ -693,7 +701,6 @@ bool PPCTargetInfo::initFeatureMap( Diags.Report(diag::err_opt_not_valid_with_opt) << "-mprivileged" << CPU; return false; } - return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec); } @@ -783,13 +790,16 @@ void PPCTargetInfo::setFeatureEnabled(llvm::StringMap &Features, } else { if (Name == "spe") Features["efpu2"] = false; - // If we're disabling altivec or vsx go ahead and disable all of the vsx - // features. - if ((Name == "altivec") || (Name == "vsx")) + // If we're disabling altivec, hard-float, or vsx go ahead and disable all + // of the vsx features. 
+ if ((Name == "altivec") || (Name == "vsx") || (Name == "hard-float")) { + if (Name != "vsx") + Features["altivec"] = Features["crypto"] = false; Features["vsx"] = Features["direct-move"] = Features["power8-vector"] = Features["float128"] = Features["power9-vector"] = Features["paired-vector-memops"] = Features["mma"] = Features["power10-vector"] = false; + } if (Name == "power8-vector") Features["power9-vector"] = Features["paired-vector-memops"] = Features["mma"] = Features["power10-vector"] = false; diff --git a/clang/test/Driver/ppc-dependent-options.cpp b/clang/test/Driver/ppc-dependent-options.cpp index 414ed1e70bb3054..46d6beafbc174c1 100644 --- a/clang/test/Driver/ppc-dependent-options.cpp +++ b/clang/test/Driver/ppc-dependent-options.cpp @@ -89,6 +89,34 @@ // RUN: -std=c++11 -msoft-float -mvsx %s 2>&1 | \ // RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-VSX +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower8-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P8VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower9-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P9VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpower10-vector %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-P10VEC + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mdirect-move %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-DIRECTMOVE + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mmma %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-MMA + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mpaired-vector-memops %s 2>&1 | \ +// RUN: FileCheck %s 
-check-prefix=CHECK-SOFTFLT-PAIREDVECMEMOP + +// RUN: not %clang -target powerpc64le-unknown-unknown -fsyntax-only \ +// RUN: -std=c++11 -msoft-float -mcrypto %s 2>&1 | \ +// RUN: FileCheck %s -check-prefix=CHECK-SOFTFLT-CRYPTO + #ifdef __VSX__ static_assert(false, "VSX enabled"); #endif @@ -126,5 +154,13 @@ static_assert(false, "Neither enabled"); // CHECK-NVSX: Neither enabled // CHECK-VSX: VSX enabled // CHECK-NALTI-VSX: error: option '-mvsx' cannot be specified with '-mno-altivec' -// CHECK-SOFTFLT-ALTI: error: option '-msoft-float' cannot be specified with '-maltivec' -// CHECK-SOFTFLT-VSX: error: option '-msoft-float' cannot be specified with '-mvsx' +// CHECK-SOFTFLT-ALTI: error: option '-maltivec' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-VSX: error: option '-mvsx' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-FLOAT128: error: option '-mfloat128' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-P8VEC: error: option '-mpower8-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-P9VEC: error: option '-mpower9-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-P10VEC: error: option '-mpower10-vector' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-DIRECTMOVE: error: option '-mdirect-move' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-MMA: error: option '-mmma' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-PAIREDVECMEMOP: error: option '-mpaired-vector-memops' cannot be specified with '-msoft-float' +// CHECK-SOFTFLT-CRYPTO: error: option '-mcrypto' cannot be specified with '-msoft-float' diff --git a/clang/test/Driver/ppc-soft-float.c b/clang/test/Driver/ppc-soft-float.c new file mode 100644 index 000000000000000..18d768aec0dedce --- /dev/null +++ b/clang/test/Driver/ppc-soft-float.c @@ -0,0 +1,28 @@ +// RUN: %clang -target powerpc64-unknown-unknown -mcpu=pwr10 -msoft-float -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECKSOFT +// RUN: %clang -target 
powerpc64-unknown-unknown -mcpu=pwr10 -S -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECKNOSOFT + +int main () { + return 0; +} + +// CHECKSOFT-DAG: -hard-float +// CHECKSOFT-DAG: -vsx +// CHECKSOFT-DAG: -altivec +// CHECKSOFT-DAG: -direct-move +// CHECKSOFT-DAG: -float128 +// CHECKSOFT-DAG: -mma +// CHECKSOFT-DAG: -paired-vector-memops +// CHECKSOFT-DAG: -power10-vector +// CHECKSOFT-DAG: -power9-vector +// CHECKSOFT-DAG: -power8-vector +// CHECKSOFT-DAG: -crypto + +// CHECKNOSOFT-DAG: +vsx +// CHECKNOSOFT-DAG: +altivec +// CHECKNOSOFT-DAG: +direct-move +// CHECKNOSOFT-DAG: +mma +// CHECKNOSOFT-DAG: +paired-vector-memops +// CHECKNOSOFT-DAG: +power10-vector +// CHECKNOSOFT-DAG: +power9-vector +// CHECKNOSOFT-DAG: +power8-vector +// CHECKNOSOFT-DAG: +crypto From 876ee11eeeb2b6a7ccfefea00265b4bfd5f9b6dd Mon Sep 17 00:00:00 2001 From: Dan Liew Date: Thu, 8 Aug 2024 09:57:47 -0700 Subject: [PATCH 038/112] [Bounds Safety][NFC] Add some missing coverage for `-fexperimental-late-parse-attributes` (#102236) Previously we weren't properly checking that using `-fexperimental-late-parse-attributes` worked on source code that didn't need late parsing. For example we weren't testing that the attribute appearing in the type position generated the right AST with `-fexperimental-late-parse-attributes` off. This patch adds additional `RUN` lines to re-run the relevant test cases with `-fexperimental-late-parse-attributes` enabled. 
rdar://133325597 --- clang/test/AST/attr-counted-by-or-null-struct-ptrs.c | 1 + clang/test/AST/attr-counted-by-struct-ptrs.c | 1 + clang/test/AST/attr-sized-by-or-null-struct-ptrs.c | 1 + clang/test/AST/attr-sized-by-struct-ptrs.c | 1 + clang/test/Sema/attr-counted-by-bounds-safety-vlas.c | 1 + clang/test/Sema/attr-counted-by-or-null-last-field.c | 7 +++++-- .../attr-counted-by-or-null-struct-ptrs-sizeless-types.c | 1 + clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c | 1 + .../test/Sema/attr-counted-by-or-null-vla-sizeless-types.c | 1 + .../test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c | 1 + clang/test/Sema/attr-counted-by-struct-ptrs.c | 1 + clang/test/Sema/attr-counted-by-vla-sizeless-types.c | 1 + clang/test/Sema/attr-counted-by-vla.c | 7 +++++-- clang/test/Sema/attr-sized-by-last-field.c | 7 +++++-- clang/test/Sema/attr-sized-by-or-null-last-field.c | 7 +++++-- .../attr-sized-by-or-null-struct-ptrs-sizeless-types.c | 1 + clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c | 1 + clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c | 1 + clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c | 1 + clang/test/Sema/attr-sized-by-struct-ptrs.c | 1 + clang/test/Sema/attr-sized-by-vla-sizeless-types.c | 1 + 21 files changed, 37 insertions(+), 8 deletions(-) diff --git a/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c b/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c index cedb3f1192eda30..075f583784fe192 100644 --- a/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c +++ b/clang/test/AST/attr-counted-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/AST/attr-counted-by-struct-ptrs.c b/clang/test/AST/attr-counted-by-struct-ptrs.c index 79a453d239cd528..0c0525823414312 100644 --- 
a/clang/test/AST/attr-counted-by-struct-ptrs.c +++ b/clang/test/AST/attr-counted-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c b/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c index 6189799b85ccb2e..73b8a71f23503ea 100644 --- a/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c +++ b/clang/test/AST/attr-sized-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/AST/attr-sized-by-struct-ptrs.c b/clang/test/AST/attr-sized-by-struct-ptrs.c index 5d9ed0094c685ba..7f7e3dfea2ac73c 100644 --- a/clang/test/AST/attr-sized-by-struct-ptrs.c +++ b/clang/test/AST/attr-sized-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -ast-dump | FileCheck %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes %s -ast-dump | FileCheck %s #define __sized_by(f) __attribute__((sized_by(f))) diff --git a/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c index 7d9c9a90880fff4..5a739f9c6bc0432 100644 --- a/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c +++ b/clang/test/Sema/attr-counted-by-bounds-safety-vlas.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -fexperimental-bounds-safety -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-bounds-safety -fexperimental-late-parse-attributes -verify %s // // This is a portion of the `attr-counted-by-vla.c` test but is checked // under the semantics of `-fexperimental-bounds-safety` which has different diff --git a/clang/test/Sema/attr-counted-by-or-null-last-field.c b/clang/test/Sema/attr-counted-by-or-null-last-field.c 
index dd3a6422521c02a..12c0b6de44f7264 100644 --- a/clang/test/Sema/attr-counted-by-or-null-last-field.c +++ b/clang/test/Sema/attr-counted-by-or-null-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) @@ -82,7 +83,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __counted_by_or_null(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'counted_by_or_null' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __counted_by_or_null(self); }; struct non_int_count { diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c index 301977300b06a19..4b898e7369c1978 100644 --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c index 017aafe0c9396ac..708bb727ce09dad 100644 --- a/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c +++ b/clang/test/Sema/attr-counted-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only 
-verify %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c b/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c index 8abd4476fe5977b..1e8c7179e790365 100644 --- a/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-or-null-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by_or_null(f) __attribute__((counted_by_or_null(f))) diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c index 9b0f2eafb13c2b5..1de93640cf45806 100644 --- a/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-struct-ptrs.c b/clang/test/Sema/attr-counted-by-struct-ptrs.c index cd2bfe36938b2e1..321d6aafbeba24a 100644 --- a/clang/test/Sema/attr-counted-by-struct-ptrs.c +++ b/clang/test/Sema/attr-counted-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c 
b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c index 31c0007501c48da..8cc5f6448254846 100644 --- a/clang/test/Sema/attr-counted-by-vla-sizeless-types.c +++ b/clang/test/Sema/attr-counted-by-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple arm64-apple-darwin -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-counted-by-vla.c b/clang/test/Sema/attr-counted-by-vla.c index 571d6e6291e6bcb..35737c03f3222e4 100644 --- a/clang/test/Sema/attr-counted-by-vla.c +++ b/clang/test/Sema/attr-counted-by-vla.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes %s -verify=expected,late #define __counted_by(f) __attribute__((counted_by(f))) @@ -80,7 +81,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __counted_by(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'counted_by' requires a non-boolean integer type argument}} + struct bar *self[] __counted_by(self); }; struct non_int_count { diff --git a/clang/test/Sema/attr-sized-by-last-field.c b/clang/test/Sema/attr-sized-by-last-field.c index 6af29e9f31435da..f2e74f7fdf4b510 100644 --- a/clang/test/Sema/attr-sized-by-last-field.c +++ b/clang/test/Sema/attr-sized-by-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify=expected,late %s #define __sized_by(f) __attribute__((sized_by(f))) @@ -82,7 +83,9 @@ struct 
found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __sized_by(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'sized_by' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __sized_by(self); }; struct non_int_size { diff --git a/clang/test/Sema/attr-sized-by-or-null-last-field.c b/clang/test/Sema/attr-sized-by-or-null-last-field.c index 96bbe847b910bf5..08ef5263a05cf03 100644 --- a/clang/test/Sema/attr-sized-by-or-null-last-field.c +++ b/clang/test/Sema/attr-sized-by-or-null-last-field.c @@ -1,4 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,immediate %s +// RUN: %clang_cc1 -fsyntax-only -fexperimental-late-parse-attributes -verify=expected,late %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) @@ -82,7 +83,9 @@ struct found_outside_of_struct { struct self_referrential { int bork; - struct bar *self[] __sized_by_or_null(self); // expected-error {{use of undeclared identifier 'self'}} + // immediate-error@+2{{use of undeclared identifier 'self'}} + // late-error@+1{{'sized_by_or_null' only applies to pointers; did you mean to use 'counted_by'?}} + struct bar *self[] __sized_by_or_null(self); }; struct non_int_size { diff --git a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c index 4a360b9722a0b0c..d960c0d31b65cc2 100644 --- a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define 
__sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c index 2c7578b5ecbe64c..4200c9275a18040 100644 --- a/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c +++ b/clang/test/Sema/attr-sized-by-or-null-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c b/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c index 398b1df592fe389..7d16c2d456a02c9 100644 --- a/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-or-null-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by_or_null(f) __attribute__((sized_by_or_null(f))) diff --git a/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c b/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c index 2e916bdb04720ce..7038330e60eee5b 100644 --- a/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-struct-ptrs-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) diff --git a/clang/test/Sema/attr-sized-by-struct-ptrs.c b/clang/test/Sema/attr-sized-by-struct-ptrs.c index 
01195469c6fe42f..07373b247d0f790 100644 --- a/clang/test/Sema/attr-sized-by-struct-ptrs.c +++ b/clang/test/Sema/attr-sized-by-struct-ptrs.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) #define __counted_by(f) __attribute__((counted_by(f))) diff --git a/clang/test/Sema/attr-sized-by-vla-sizeless-types.c b/clang/test/Sema/attr-sized-by-vla-sizeless-types.c index 37e91639bb4a1f1..8a94b1217c9064a 100644 --- a/clang/test/Sema/attr-sized-by-vla-sizeless-types.c +++ b/clang/test/Sema/attr-sized-by-vla-sizeless-types.c @@ -1,5 +1,6 @@ // __SVInt8_t is specific to ARM64 so specify that in the target triple // RUN: %clang_cc1 -triple arm64-apple-darwin -fsyntax-only -verify %s +// RUN: %clang_cc1 -fexperimental-late-parse-attributes -triple arm64-apple-darwin -fsyntax-only -verify %s #define __sized_by(f) __attribute__((sized_by(f))) From 6b78e94110dda175b248b1c361a098236522e1e2 Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Fri, 9 Aug 2024 01:10:25 +0800 Subject: [PATCH 039/112] [libc][CMake] Fix build issues of libc on Windows (#102499) Due to an issue mentioned in `llvm/projects/CMakeLists.txt`, libc build is disabled by default when building with `clang-cl` on Windows. This PR sets `LLVM_FORCE_BUILD_RUNTIME` to `ON` to bypass this limit for libc and make libc build with `clang-cl` on Windows. 
--- llvm/cmake/modules/CrossCompile.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index 39b4abaa0d9313e..e36a71f522d82c9 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -71,6 +71,12 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) if("libc" IN_LIST LLVM_ENABLE_PROJECTS AND NOT LIBC_HDRGEN_EXE) set(libc_flags -DLLVM_LIBC_FULL_BUILD=ON -DLIBC_HDRGEN_ONLY=ON) + if(MSVC) + # Due to some issues mentioned in llvm/projects/CMakeLists.txt, libc build is disabled by + # default in the cross target when building with MSVC compatible compilers on Windows. Add + # LLVM_FORCE_BUILD_RUNTIME to bypass this issue and force its building on Windows. + list(APPEND libc_flags -DLLVM_FORCE_BUILD_RUNTIME=ON) + endif() endif() add_custom_command(OUTPUT ${${project_name}_${target_name}_BUILD}/CMakeCache.txt From ad00e8a8ddf8fd5e3a49a1319bec93bfea8d37c1 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 8 Aug 2024 18:20:24 +0100 Subject: [PATCH 040/112] [DAG] Replace m_SpecificInt(1) -> m_One() For SDPatternMatch there's no difference in undef/poison vector element handling - in fact m_One() just wraps m_SpecificInt(1) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7a6a1c6b832b39c..f827eb559a01cf8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2563,14 +2563,12 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) { if ((!LegalOperations || hasOperation(ISD::AVGCEILU, VT)) && sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)), - m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { 
return DAG.getNode(ISD::AVGCEILU, DL, VT, A, B); } if ((!LegalOperations || hasOperation(ISD::AVGCEILS, VT)) && sd_match(N, m_Sub(m_Or(m_Value(A), m_Value(B)), - m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGCEILS, DL, VT, A, B); } return SDValue(); @@ -2928,14 +2926,12 @@ SDValue DAGCombiner::foldAddToAvg(SDNode *N, const SDLoc &DL) { if ((!LegalOperations || hasOperation(ISD::AVGFLOORU, VT)) && sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)), - m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Srl(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGFLOORU, DL, VT, A, B); } if ((!LegalOperations || hasOperation(ISD::AVGFLOORS, VT)) && sd_match(N, m_Add(m_And(m_Value(A), m_Value(B)), - m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), - m_SpecificInt(1))))) { + m_Sra(m_Xor(m_Deferred(A), m_Deferred(B)), m_One())))) { return DAG.getNode(ISD::AVGFLOORS, DL, VT, A, B); } From 5ab7b0de0ba59ca3b89db7b5bb72282f2d51c5e2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 8 Aug 2024 10:26:29 -0700 Subject: [PATCH 041/112] [SLP][NFC]Add a test for segmented load, NFC. 
--- .../SLPVectorizer/RISCV/segmented-loads.ll | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll new file mode 100644 index 000000000000000..54eb564768318b8 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv64-unknown-linux -mattr=+v -passes=slp-vectorizer -S | FileCheck %s + +@src = common global [8 x double] zeroinitializer, align 64 +@dst = common global [4 x double] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <4 x double> [[TMP3]], ptr @dst, align 8 +; CHECK-NEXT: ret void +; + %a0 = load double, ptr @src, align 8 + %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 + %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 + %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 + %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 + %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 + %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 + %a7 = load double, ptr getelementptr inbounds ([8 x double], 
ptr @src, i32 0, i64 7), align 8 + %res1 = fsub fast double %a0, %a1 + %res2 = fsub fast double %a2, %a3 + %res3 = fsub fast double %a4, %a5 + %res4 = fsub fast double %a6, %a7 + store double %res1, ptr @dst, align 8 + store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8 + store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8 + store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8 + ret void +} From 4516e631554ef05cfe38600097c7993be538c636 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 8 Aug 2024 12:32:30 -0500 Subject: [PATCH 042/112] [compiler-rt] Pass the CMake binary dir directly (#102493) Summary: The usage of `LLVM_CMAKE_DIR` is confusing in CMake, because in LLVM it refers to `llvm/cmake/modules/` while compiler-rt wants the `build/` dir for finding LLVM. THis passes it manually. --- llvm/runtimes/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 42b1b86ebaadf0e..5a4435bc96e0c80 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -89,6 +89,7 @@ function(builtin_default_target compiler_rt_path) -DLLVM_RUNTIME_OUTPUT_INTDIR=${LLVM_TOOLS_BINARY_DIR} -DLLVM_DEFAULT_TARGET_TRIPLE=${LLVM_TARGET_TRIPLE} -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=${LLVM_ENABLE_PER_TARGET_RUNTIME_DIR} + -DLLVM_CMAKE_DIR=${CMAKE_BINARY_DIR} -DCMAKE_C_COMPILER_WORKS=ON -DCMAKE_ASM_COMPILER_WORKS=ON ${COMMON_CMAKE_ARGS} @@ -128,6 +129,7 @@ function(builtin_register_target compiler_rt_path name) CMAKE_ARGS -DLLVM_LIBRARY_OUTPUT_INTDIR=${LLVM_LIBRARY_DIR} -DLLVM_RUNTIME_OUTPUT_INTDIR=${LLVM_TOOLS_BINARY_DIR} -DLLVM_ENABLE_PER_TARGET_RUNTIME_DIR=ON + -DLLVM_CMAKE_DIR=${CMAKE_BINARY_DIR} -DCMAKE_C_COMPILER_WORKS=ON -DCMAKE_ASM_COMPILER_WORKS=ON -DCOMPILER_RT_DEFAULT_TARGET_ONLY=ON From 4d2009cdc7f5af9609e4dfec30db625db1a6cd5b Mon Sep 17 00:00:00 2001 From: 
This passes it manually.
if (isPowerOf2_64(C->getZExtValue())) BTK = ConstantBit; @@ -30540,8 +30539,7 @@ static std::pair FindSingleBitChange(Value *V) { } // Check if V is some power of 2 pattern known to be non-zero - auto *I = dyn_cast(V); - if (I) { + if (auto *I = dyn_cast(V)) { bool Not = false; // Check if we have a NOT Value *PeekI; From 458fa12202868281f11680a18878d57defcccc9f Mon Sep 17 00:00:00 2001 From: Jannick Kremer Date: Thu, 8 Aug 2024 18:43:01 +0100 Subject: [PATCH 045/112] [Github] Allow CI to run different Python version tests at once (#102455) Previously, #77219 added a `python_version` parameter for the Github Actions CI Ninja-based build tests. This is necessary to run component tests on different Python versions, as is currently done by the only user of this parameter, the [Libclang Python bindings test](https://github.com/llvm/llvm-project/blob/main/.github/workflows/libclang-python-tests.yml). The parameter is missing from the concurrency group of the workflow, meaning that starting the workflow with two different Python versions immediately cancels one of them, as pointed out by https://github.com/llvm/llvm-project/pull/77219#issuecomment-1937105822. This change fixes that problem by making the Python version part of the concurrency group key, and removes the superfluous concurrency group from the calling workflow. --- .github/workflows/libclang-python-tests.yml | 6 ------ .github/workflows/llvm-project-tests.yml | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index 43ded0af3ac21cf..801a701724789ad 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -22,12 +22,6 @@ on: - '.github/workflows/libclang-python-tests.yml' - '.github/workflows/llvm-project-tests.yml' -concurrency: - # Skip intermediate builds: always. - # Cancel intermediate builds: only if it is a pull request build. 
- group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} - jobs: check-clang-python: # Build libclang and then run the libclang Python binding's unit tests. diff --git a/.github/workflows/llvm-project-tests.yml b/.github/workflows/llvm-project-tests.yml index 17a54be16badc15..95a3890c0d2dc7f 100644 --- a/.github/workflows/llvm-project-tests.yml +++ b/.github/workflows/llvm-project-tests.yml @@ -51,7 +51,7 @@ concurrency: # Cancel intermediate builds: only if it is a pull request build. # If the group name here is the same as the group name in the workflow that includes # this one, then the action will try to wait on itself and get stuck. - group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}${{ github.ref }} + group: llvm-project-${{ github.workflow }}-${{ inputs.projects }}-${{ inputs.python_version }}${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: From 6b5308b7924108d63149d7c521f21c5e90da7a09 Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Thu, 8 Aug 2024 13:58:30 -0400 Subject: [PATCH 046/112] [NFC] Remove unnecessary copy of Triples (#102469) Remove unnecessary copy of Triple in ItaniumMangle.cpp, SemaType.cpp --- clang/lib/AST/ItaniumMangle.cpp | 4 +--- clang/lib/Sema/SemaType.cpp | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index dc317f2a7870b8d..976670d1efa5618 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2727,8 +2727,6 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp // ::= U // ::= U - llvm::Triple Triple = getASTContext().getTargetInfo().getTriple(); - SmallString<64> ASString; LangAS AS = Quals.getAddressSpace(); @@ -2800,7 +2798,7 @@ void CXXNameMangler::mangleQualifiers(Qualifiers Quals, const DependentAddressSp // For z/OS, there are no special mangling rules applied to the 
ptr32 // qualifier. Ex: void foo(int * __ptr32 p) -> _Z3f2Pi. The mangling for // "p" is treated the same as a regular integer pointer. - if (!Triple.isOSzOS()) + if (!getASTContext().getTargetInfo().getTriple().isOSzOS()) ASString = "ptr32_uptr"; break; case LangAS::ptr64: diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 91c8f18648fa473..08020f9f889270a 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7057,7 +7057,6 @@ static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State, // Add address space to type based on its attributes. LangAS ASIdx = LangAS::Default; - llvm::Triple Triple = S.Context.getTargetInfo().getTriple(); uint64_t PtrWidth = S.Context.getTargetInfo().getPointerWidth(LangAS::Default); if (PtrWidth == 32) { @@ -7066,7 +7065,7 @@ static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &State, else if (Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; } else if (PtrWidth == 64 && Attrs[attr::Ptr32]) { - if (Triple.isOSzOS() || Attrs[attr::UPtr]) + if (S.Context.getTargetInfo().getTriple().isOSzOS() || Attrs[attr::UPtr]) ASIdx = LangAS::ptr32_uptr; else ASIdx = LangAS::ptr32_sptr; From 28ba8a56b6fb9ec61897fa84369f46e43be94c03 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Thu, 8 Aug 2024 11:04:52 -0700 Subject: [PATCH 047/112] [LLDB] Impove ObjectFileELF's .dynamic parsing and usage. (#101237) This patch improves the ability of a ObjectFileELF instance to read the .dynamic section. It adds the ability to read the .dynamic section from the PT_DYNAMIC program header which is useful for ELF files that have no section headers and for ELF files that are read from memory. It cleans up the usage of the .dynamic entries so that ObjectFileELF::ParseDynamicSymbols() is the only code that parses .dynamic entries, teaches that function the read and store the string values for each .dynamic entry. We now dump the .dynamic entries in the output of "image dump objfile". 
for when we have an ELF file with no section headers or we are reading it from memory.
to read to get at least the ELF header
+ lldb::offset_t end_phdrs = hdr.e_phoff + (hdr.e_phentsize * hdr.e_phnum); + if (end_phdrs > data_sp->GetByteSize()) + data_sp = ReadMemory(process_sp, header_addr, end_phdrs); + + std::unique_ptr objfile_up( + new ObjectFileELF(module_sp, data_sp, process_sp, header_addr)); + ArchSpec spec = objfile_up->GetArchitecture(); + if (spec && objfile_up->SetModulesArchitecture(spec)) + return objfile_up.release(); + return nullptr; } @@ -873,42 +889,37 @@ Address ObjectFileELF::GetImageInfoAddress(Target *target) { if (!section_list) return Address(); - // Find the SHT_DYNAMIC (.dynamic) section. - SectionSP dynsym_section_sp( - section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true)); - if (!dynsym_section_sp) - return Address(); - assert(dynsym_section_sp->GetObjectFile() == this); + for (size_t i = 0; i < m_dynamic_symbols.size(); ++i) { + const ELFDynamic &symbol = m_dynamic_symbols[i].symbol; - user_id_t dynsym_id = dynsym_section_sp->GetID(); - const ELFSectionHeaderInfo *dynsym_hdr = GetSectionHeaderByIndex(dynsym_id); - if (!dynsym_hdr) - return Address(); + if (symbol.d_tag != DT_DEBUG && symbol.d_tag != DT_MIPS_RLD_MAP && + symbol.d_tag != DT_MIPS_RLD_MAP_REL) + continue; - for (size_t i = 0; i < m_dynamic_symbols.size(); ++i) { - ELFDynamic &symbol = m_dynamic_symbols[i]; + // Compute the offset as the number of previous entries plus the size of + // d_tag. + const addr_t offset = (i * 2 + 1) * GetAddressByteSize(); + const addr_t d_file_addr = m_dynamic_base_addr + offset; + Address d_addr; + if (d_addr.ResolveAddressUsingFileSections(d_file_addr, GetSectionList())) + return Address(); + if (symbol.d_tag == DT_DEBUG) + return d_addr; - if (symbol.d_tag == DT_DEBUG) { - // Compute the offset as the number of previous entries plus the size of - // d_tag. - addr_t offset = i * dynsym_hdr->sh_entsize + GetAddressByteSize(); - return Address(dynsym_section_sp, offset); - } // MIPS executables uses DT_MIPS_RLD_MAP_REL to support PIE. 
DT_MIPS_RLD_MAP // exists in non-PIE. - else if ((symbol.d_tag == DT_MIPS_RLD_MAP || - symbol.d_tag == DT_MIPS_RLD_MAP_REL) && - target) { - addr_t offset = i * dynsym_hdr->sh_entsize + GetAddressByteSize(); - addr_t dyn_base = dynsym_section_sp->GetLoadBaseAddress(target); - if (dyn_base == LLDB_INVALID_ADDRESS) + if ((symbol.d_tag == DT_MIPS_RLD_MAP || + symbol.d_tag == DT_MIPS_RLD_MAP_REL) && + target) { + const addr_t d_load_addr = d_addr.GetLoadAddress(target); + if (d_load_addr == LLDB_INVALID_ADDRESS) return Address(); Status error; if (symbol.d_tag == DT_MIPS_RLD_MAP) { // DT_MIPS_RLD_MAP tag stores an absolute address of the debug pointer. Address addr; - if (target->ReadPointerFromMemory(dyn_base + offset, error, addr, true)) + if (target->ReadPointerFromMemory(d_load_addr, error, addr, true)) return addr; } if (symbol.d_tag == DT_MIPS_RLD_MAP_REL) { @@ -916,18 +927,17 @@ Address ObjectFileELF::GetImageInfoAddress(Target *target) { // relative to the address of the tag. uint64_t rel_offset; rel_offset = target->ReadUnsignedIntegerFromMemory( - dyn_base + offset, GetAddressByteSize(), UINT64_MAX, error, true); + d_load_addr, GetAddressByteSize(), UINT64_MAX, error, true); if (error.Success() && rel_offset != UINT64_MAX) { Address addr; addr_t debug_ptr_address = - dyn_base + (offset - GetAddressByteSize()) + rel_offset; + d_load_addr - GetAddressByteSize() + rel_offset; addr.SetOffset(debug_ptr_address); return addr; } } } } - return Address(); } @@ -970,62 +980,23 @@ Address ObjectFileELF::GetBaseAddress() { return LLDB_INVALID_ADDRESS; } -// ParseDependentModules size_t ObjectFileELF::ParseDependentModules() { if (m_filespec_up) return m_filespec_up->GetSize(); m_filespec_up = std::make_unique(); - if (!ParseSectionHeaders()) - return 0; - - SectionList *section_list = GetSectionList(); - if (!section_list) - return 0; - - // Find the SHT_DYNAMIC section. 
- Section *dynsym = - section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) - .get(); - if (!dynsym) - return 0; - assert(dynsym->GetObjectFile() == this); - - const ELFSectionHeaderInfo *header = GetSectionHeaderByIndex(dynsym->GetID()); - if (!header) - return 0; - // sh_link: section header index of string table used by entries in the - // section. - Section *dynstr = section_list->FindSectionByID(header->sh_link).get(); - if (!dynstr) - return 0; - - DataExtractor dynsym_data; - DataExtractor dynstr_data; - if (ReadSectionData(dynsym, dynsym_data) && - ReadSectionData(dynstr, dynstr_data)) { - ELFDynamic symbol; - const lldb::offset_t section_size = dynsym_data.GetByteSize(); - lldb::offset_t offset = 0; - - // The only type of entries we are concerned with are tagged DT_NEEDED, - // yielding the name of a required library. - while (offset < section_size) { - if (!symbol.Parse(dynsym_data, &offset)) - break; - - if (symbol.d_tag != DT_NEEDED) + if (ParseDynamicSymbols()) { + for (const auto &entry : m_dynamic_symbols) { + if (entry.symbol.d_tag != DT_NEEDED) continue; - - uint32_t str_index = static_cast(symbol.d_val); - const char *lib_name = dynstr_data.PeekCStr(str_index); - FileSpec file_spec(lib_name); - FileSystem::Instance().Resolve(file_spec); - m_filespec_up->Append(file_spec); + if (!entry.name.empty()) { + FileSpec file_spec(entry.name); + FileSystem::Instance().Resolve(file_spec); + m_filespec_up->Append(file_spec); + } } } - return m_filespec_up->GetSize(); } @@ -2472,48 +2443,47 @@ size_t ObjectFileELF::ParseDynamicSymbols() { if (m_dynamic_symbols.size()) return m_dynamic_symbols.size(); - SectionList *section_list = GetSectionList(); - if (!section_list) + std::optional dynamic_data = GetDynamicData(); + if (!dynamic_data) return 0; - // Find the SHT_DYNAMIC section. 
- Section *dynsym = - section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) - .get(); - if (!dynsym) - return 0; - assert(dynsym->GetObjectFile() == this); - - ELFDynamic symbol; - DataExtractor dynsym_data; - if (ReadSectionData(dynsym, dynsym_data)) { - const lldb::offset_t section_size = dynsym_data.GetByteSize(); - lldb::offset_t cursor = 0; - - while (cursor < section_size) { - if (!symbol.Parse(dynsym_data, &cursor)) + ELFDynamicWithName e; + lldb::offset_t cursor = 0; + while (e.symbol.Parse(*dynamic_data, &cursor)) { + m_dynamic_symbols.push_back(e); + if (e.symbol.d_tag == DT_NULL) + break; + } + if (std::optional dynstr_data = GetDynstrData()) { + for (ELFDynamicWithName &entry : m_dynamic_symbols) { + switch (entry.symbol.d_tag) { + case DT_NEEDED: + case DT_SONAME: + case DT_RPATH: + case DT_RUNPATH: + case DT_AUXILIARY: + case DT_FILTER: { + lldb::offset_t cursor = entry.symbol.d_val; + const char *name = dynstr_data->GetCStr(&cursor); + if (name) + entry.name = std::string(name); break; - - m_dynamic_symbols.push_back(symbol); + } + default: + break; + } } } - return m_dynamic_symbols.size(); } const ELFDynamic *ObjectFileELF::FindDynamicSymbol(unsigned tag) { if (!ParseDynamicSymbols()) return nullptr; - - DynamicSymbolCollIter I = m_dynamic_symbols.begin(); - DynamicSymbolCollIter E = m_dynamic_symbols.end(); - for (; I != E; ++I) { - ELFDynamic *symbol = &*I; - - if (symbol->d_tag == tag) - return symbol; + for (const auto &entry : m_dynamic_symbols) { + if (entry.symbol.d_tag == tag) + return &entry.symbol; } - return nullptr; } @@ -3230,7 +3200,10 @@ void ObjectFileELF::Dump(Stream *s) { ArchSpec header_arch = GetArchitecture(); *s << ", file = '" << m_file - << "', arch = " << header_arch.GetArchitectureName() << "\n"; + << "', arch = " << header_arch.GetArchitectureName(); + if (m_memory_addr != LLDB_INVALID_ADDRESS) + s->Printf(", addr = %#16.16" PRIx64, m_memory_addr); + s->EOL(); DumpELFHeader(s, m_header); s->EOL(); @@ 
-3248,6 +3221,8 @@ void ObjectFileELF::Dump(Stream *s) { s->EOL(); DumpDependentModules(s); s->EOL(); + DumpELFDynamic(s); + s->EOL(); } // DumpELFHeader @@ -3492,6 +3467,111 @@ void ObjectFileELF::DumpDependentModules(lldb_private::Stream *s) { } } +std::string static getDynamicTagAsString(uint16_t Arch, uint64_t Type) { +#define DYNAMIC_STRINGIFY_ENUM(tag, value) \ + case value: \ + return #tag; + +#define DYNAMIC_TAG(n, v) + switch (Arch) { + case llvm::ELF::EM_AARCH64: + switch (Type) { +#define AARCH64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef AARCH64_DYNAMIC_TAG + } + break; + + case llvm::ELF::EM_HEXAGON: + switch (Type) { +#define HEXAGON_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef HEXAGON_DYNAMIC_TAG + } + break; + + case llvm::ELF::EM_MIPS: + switch (Type) { +#define MIPS_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef MIPS_DYNAMIC_TAG + } + break; + + case llvm::ELF::EM_PPC: + switch (Type) { +#define PPC_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef PPC_DYNAMIC_TAG + } + break; + + case llvm::ELF::EM_PPC64: + switch (Type) { +#define PPC64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef PPC64_DYNAMIC_TAG + } + break; + + case llvm::ELF::EM_RISCV: + switch (Type) { +#define RISCV_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) +#include "llvm/BinaryFormat/DynamicTags.def" +#undef RISCV_DYNAMIC_TAG + } + break; + } +#undef DYNAMIC_TAG + switch (Type) { +// Now handle all dynamic tags except the architecture specific ones +#define AARCH64_DYNAMIC_TAG(name, value) +#define MIPS_DYNAMIC_TAG(name, value) +#define HEXAGON_DYNAMIC_TAG(name, value) +#define PPC_DYNAMIC_TAG(name, value) +#define 
PPC64_DYNAMIC_TAG(name, value) +#define RISCV_DYNAMIC_TAG(name, value) +// Also ignore marker tags such as DT_HIOS (maps to DT_VERNEEDNUM), etc. +#define DYNAMIC_TAG_MARKER(name, value) +#define DYNAMIC_TAG(name, value) \ + case value: \ + return #name; +#include "llvm/BinaryFormat/DynamicTags.def" +#undef DYNAMIC_TAG +#undef AARCH64_DYNAMIC_TAG +#undef MIPS_DYNAMIC_TAG +#undef HEXAGON_DYNAMIC_TAG +#undef PPC_DYNAMIC_TAG +#undef PPC64_DYNAMIC_TAG +#undef RISCV_DYNAMIC_TAG +#undef DYNAMIC_TAG_MARKER +#undef DYNAMIC_STRINGIFY_ENUM + default: + return "0x" + llvm::utohexstr(Type, true); + } +} + +void ObjectFileELF::DumpELFDynamic(lldb_private::Stream *s) { + ParseDynamicSymbols(); + if (m_dynamic_symbols.empty()) + return; + + s->PutCString(".dynamic:\n"); + s->PutCString("IDX d_tag d_val/d_ptr\n"); + s->PutCString("==== ---------------- ------------------\n"); + uint32_t idx = 0; + for (const auto &entry : m_dynamic_symbols) { + s->Printf("[%2u] ", idx++); + s->Printf( + "%-16s 0x%16.16" PRIx64, + getDynamicTagAsString(m_header.e_machine, entry.symbol.d_tag).c_str(), + entry.symbol.d_ptr); + if (!entry.name.empty()) + s->Printf(" \"%s\"", entry.name.c_str()); + s->EOL(); + } +} + ArchSpec ObjectFileELF::GetArchitecture() { if (!ParseHeader()) return ArchSpec(); @@ -3664,7 +3744,24 @@ llvm::ArrayRef ObjectFileELF::ProgramHeaders() { } DataExtractor ObjectFileELF::GetSegmentData(const ELFProgramHeader &H) { - return DataExtractor(m_data, H.p_offset, H.p_filesz); + // Try and read the program header from our cached m_data which can come from + // the file on disk being mmap'ed or from the initial part of the ELF file we + // read from memory and cached. + DataExtractor data = DataExtractor(m_data, H.p_offset, H.p_filesz); + if (data.GetByteSize() == H.p_filesz) + return data; + if (IsInMemory()) { + // We have a ELF file in process memory, read the program header data from + // the process. 
the dynamic symbol table's string table
+ const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB); + const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ); + if (strtab == nullptr || strsz == nullptr) + return std::nullopt; + + if (ProcessSP process_sp = m_process_wp.lock()) { + if (DataBufferSP data_sp = + ReadMemory(process_sp, strtab->d_ptr, strsz->d_val)) + return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); + } else { + // We have an ELF file with no section headers or we didn't find the + // .dynamic section. Try and find the .dynstr section. + Address addr; + if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) { + DataExtractor data; + addr.GetSection()->GetSectionData(data); + return DataExtractor(data, + strtab->d_ptr - addr.GetSection()->GetFileAddress(), + strsz->d_val); + } + } + return std::nullopt; +} + +std::optional ObjectFileELF::GetDynamicData() { + DataExtractor data; + // The PT_DYNAMIC program header describes where the .dynamic section is and + // doesn't require parsing section headers. The PT_DYNAMIC is required by + // executables and shared libraries so it will always be available. + for (const ELFProgramHeader &H : ProgramHeaders()) { + if (H.p_type == llvm::ELF::PT_DYNAMIC) { + data = GetSegmentData(H); + if (data.GetByteSize() > 0) { + m_dynamic_base_addr = H.p_vaddr; + return data; + } + } + } + // Fall back to using section headers. + if (SectionList *section_list = GetSectionList()) { + // Find the SHT_DYNAMIC section. 
+ if (Section *dynamic = + section_list + ->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) + .get()) { + assert(dynamic->GetObjectFile() == this); + if (ReadSectionData(dynamic, data)) { + m_dynamic_base_addr = dynamic->GetFileAddress(); + return data; + } + } + } + return std::nullopt; +} diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h index 844e981b1d890a3..aba3a5bfcbf5b6c 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h @@ -183,7 +183,11 @@ class ObjectFileELF : public lldb_private::ObjectFile { typedef SectionHeaderColl::iterator SectionHeaderCollIter; typedef SectionHeaderColl::const_iterator SectionHeaderCollConstIter; - typedef std::vector DynamicSymbolColl; + struct ELFDynamicWithName { + elf::ELFDynamic symbol; + std::string name; + }; + typedef std::vector DynamicSymbolColl; typedef DynamicSymbolColl::iterator DynamicSymbolCollIter; typedef DynamicSymbolColl::const_iterator DynamicSymbolCollConstIter; @@ -213,6 +217,10 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// Collection of section headers. SectionHeaderColl m_section_headers; + /// The file address of the .dynamic section. This can be found in the p_vaddr + /// of the PT_DYNAMIC program header. + lldb::addr_t m_dynamic_base_addr = LLDB_INVALID_ADDRESS; + /// Collection of symbols from the dynamic table. DynamicSymbolColl m_dynamic_symbols; @@ -384,6 +392,9 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// ELF dependent module dump routine. 
void DumpDependentModules(lldb_private::Stream *s); + /// ELF dump the .dynamic section + void DumpELFDynamic(lldb_private::Stream *s); + const elf::ELFDynamic *FindDynamicSymbol(unsigned tag); unsigned PLTRelocationType(); @@ -402,6 +413,28 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// .gnu_debugdata section or \c nullptr if an error occured or if there's no /// section with that name. std::shared_ptr GetGnuDebugDataObjectFile(); + + /// Get the bytes that represent the .dynamic section. + /// + /// This function will fetch the data for the .dynamic section in an ELF file. + /// The PT_DYNAMIC program header will be used to extract the data and this + /// function will fall back to using the section headers if PT_DYNAMIC isn't + /// found. + /// + /// \return The bytes that represent the string table data or \c std::nullopt + /// if an error occured. + std::optional GetDynamicData(); + + /// Get the bytes that represent the dynamic string table data. + /// + /// This function will fetch the data for the string table in an ELF file. If + /// the ELF file is loaded from a file on disk, it will use the section + /// headers to extract the data and fall back to using the DT_STRTAB and + /// DT_STRSZ .dynamic entries. + /// + /// \return The bytes that represent the string table data or \c std::nullopt + /// if an error occured. 
+ std::optional GetDynstrData(); }; #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 2608a9c5fb79a2f..35317d209de1f9a 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -454,9 +454,10 @@ AddressClass ObjectFile::GetAddressClass(addr_t file_addr) { return AddressClass::eUnknown; } -DataBufferSP ObjectFile::ReadMemory(const ProcessSP &process_sp, - lldb::addr_t addr, size_t byte_size) { - DataBufferSP data_sp; +WritableDataBufferSP ObjectFile::ReadMemory(const ProcessSP &process_sp, + lldb::addr_t addr, + size_t byte_size) { + WritableDataBufferSP data_sp; if (process_sp) { std::unique_ptr data_up(new DataBufferHeap(byte_size, 0)); Status error; diff --git a/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp b/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp new file mode 100644 index 000000000000000..9cae6c99c9f7616 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp @@ -0,0 +1,5 @@ +#include +int main() { + printf("Hello World\n"); // Use something from libc.so + return 0; +} diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml b/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml new file mode 100644 index 000000000000000..bc34e16e0b6ad24 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml @@ -0,0 +1,90 @@ +## This test verifies that loading an ELF file that has no section headers can +## find the contents on the .dynamic section and the strings associated with +## the .dynamic seciton. 
+## - Loading the .dynamic section from the PT_DYNAMIC +## This test will make a simple executable that links against libc.so and we +## verify that we can find the DT_NEEDED entry with the shared library found +## in the .dynamic dump from "image dump objfile" + +# RUN: yaml2obj %s -o %t + +# RUN: %lldb -b \ +# RUN: -o "target create -d '%t'" \ +# RUN: -o "image dump objfile" \ +# RUN: | FileCheck %s --dump-input=always +# CHECK: (lldb) image dump objfile +# CHECK: Dumping headers for 1 module(s). +# CHECK: ObjectFileELF, file = +# CHECK: ELF Header +# Make sure there are no section headers +# CHECK: e_shnum = 0x00000000 + +# Make sure we find the program headers and see a PT_DYNAMIC entry. +# CHECK: Program Headers +# CHECK: IDX p_type p_offset p_vaddr p_paddr p_filesz p_memsz p_flags p_align +# CHECK: ==== --------------- -------- -------- -------- -------- -------- ------------------------- -------- +# CHECK: [ 0] PT_LOAD 000000b0 00000000 00000000 00000170 00000170 00000000 ( ) 00000001 +# CHECK: [ 1] PT_DYNAMIC 000001b0 00000100 00000100 00000070 00000070 00000000 ( ) 00000008 + +# CHECK: Dependent Modules: +# CHECK: ccc +# CHECK: aaa +# CHECK: bbb + +# Make sure we see some sections created from the program headers +# MAIN: SectID +# MAIN: PT_LOAD[0] + +# CHECK: .dynamic: +# CHECK: IDX d_tag d_val/d_ptr +# CHECK: ==== ---------------- ------------------ +# CHECK: [ 0] STRTAB 0x0000000000000000 +# CHECK: [ 1] NEEDED 0x0000000000000009 "ccc" +# CHECK: [ 2] NEEDED 0x0000000000000001 "aaa" +# CHECK: [ 3] NEEDED 0x0000000000000005 "bbb" +# CHECK: [ 4] STRSZ 0x0000000000000100 +# CHECK: [ 5] DEBUG 0x00000000deadbeef +# CHECK: [ 6] NULL 0x0000000000000000 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Type: SectionHeaderTable + NoHeaders: true + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Content: '00616161006262620063636300' ## 0,a,a,a,0,b,b,b,0,c,c,c,0 + Size: 0x100 + - Name: 
.dynamic + Type: SHT_DYNAMIC + Address: 0x100 + Entries: + - Tag: DT_STRTAB + Value: 0x0000000000000000 + - Tag: DT_NEEDED + Value: 9 + - Tag: DT_NEEDED + Value: 1 + - Tag: DT_NEEDED + Value: 5 + - Tag: DT_STRSZ + Value: 0x100 + - Tag: DT_DEBUG + Value: 0xdeadbeef + - Tag: DT_NULL + Value: 0x0 +ProgramHeaders: + - Type: PT_LOAD + VAddr: 0x0 + FirstSec: .dynstr + LastSec: .dynamic + - Type: PT_DYNAMIC + FirstSec: .dynamic + LastSec: .dynamic + VAddr: 0x100 + Align: 0x8 diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test new file mode 100644 index 000000000000000..0b1c01486a4b431 --- /dev/null +++ b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test @@ -0,0 +1,55 @@ +// REQUIRES: system-linux, native + +// This test verifies that loading an ELF file from memory works and the new +// features that were added when loading from memory work like: +// - Loading the .dynamic section from the PT_DYNAMIC since ELF files loaded +// from memory don't have the section headers available. +// This test will make a simple executable that: +// - links against libc +// - runs and stops at a breakpoint +// - create a memory ELF file +// - verify that "image dump objfile" will dump the dynamic section of the +// memory elf file and find the .dynamic string table. 
+ +// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t + +// RUN: %lldb %t -b \ +// RUN: -o "b main" \ +// RUN: -o "run" \ +// RUN: -o "script real_module = lldb.target.module[0]" \ +// RUN: -o "script base_addr = real_module.GetObjectFileHeaderAddress().GetLoadAddress(lldb.target)" \ +// RUN: -o "script mem_module = lldb.SBModule(lldb.process, base_addr)" \ +// RUN: -o "script target2 = lldb.debugger.CreateTarget('')" \ +// RUN: -o "script target2.AddModule(mem_module)" \ +// RUN: -o "target select 1" \ +// RUN: -o "image dump objfile" \ +// RUN: | FileCheck %s --check-prefix=MAIN --dump-input=always +// MAIN: (lldb) image dump objfile +// MAIN: Dumping headers for 1 module(s). +// MAIN: ObjectFileELF, file = '', arch = {{.*, addr = 0x[0-9a-f]+}} +// MAIN: ELF Header + +// Make sure we find the program headers and see a PT_DYNAMIC entry. +// MAIN: Program Headers +// MAIN: ] PT_DYNAMIC + +// Make sure we see some sections created from the program headers +// MAIN: SectID +// MAIN: PT_LOAD[0] + +// Ensure we find some dependent modules as won't find these if we aren't able +// to load the .dynamic section from the PT_DYNAMIC program header. +// MAIN: Dependent Modules: + +// Check for the .dynamic dump and ensure we find all dynamic entries that are +// required to be there and needed to get the .dynstr section and the symbol +// table, and the DT_DEBUG entry to find the list of shared libraries. 
+// MAIN: .dynamic: +// Make sure we found the .dynstr section by checking for valid strings after NEEDED +// MAIN-DAG: NEEDED {{0x[0-9a-f]+ ".*libc.*"}} +// MAIN-DAG: STRTAB {{0x[0-9a-f]+}} +// MAIN-DAG: SYMTAB {{0x[0-9a-f]+}} +// MAIN-DAG: STRSZ {{0x[0-9a-f]+}} +// MAIN-DAG: SYMENT {{0x[0-9a-f]+}} +// MAIN-DAG: DEBUG {{0x[0-9a-f]+}} +// MAIN: NULL {{0x[0-9a-f]+}} From d9af9cf436ad1b892ecf8b794b0052135cc4029c Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:05:39 -0700 Subject: [PATCH 048/112] [flang] Don't set Subroutine flag on PROCEDURE() pointers (#102011) External procedures about which no characteristics are known -- from EXTERNAL and PROCEDURE() statements of entities that are never called -- are marked as subroutines. This shouldn't be done for procedure pointers, however. Fixes https://github.com/llvm/llvm-project/issues/101908. --- flang/lib/Semantics/resolve-names.cpp | 2 +- flang/test/Semantics/assign03.f90 | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index b7725c5b0022844..2fe45f9c941d71f 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -9235,7 +9235,7 @@ void ResolveNamesVisitor::ResolveSpecificationParts(ProgramTree &node) { node.GetKind() == ProgramTree::Kind::Submodule}; for (auto &pair : *node.scope()) { Symbol &symbol{*pair.second}; - if (inModule && symbol.attrs().test(Attr::EXTERNAL) && + if (inModule && symbol.attrs().test(Attr::EXTERNAL) && !IsPointer(symbol) && !symbol.test(Symbol::Flag::Function) && !symbol.test(Symbol::Flag::Subroutine)) { // in a module, external proc without return type is subroutine diff --git a/flang/test/Semantics/assign03.f90 b/flang/test/Semantics/assign03.f90 index a80ef1e102b2b99..d8e7f14238f9203 100644 --- a/flang/test/Semantics/assign03.f90 +++ b/flang/test/Semantics/assign03.f90 @@ -1,6 
+1,10 @@ ! RUN: %python %S/test_errors.py %s %flang_fc1 ! Pointer assignment constraints 10.2.2.2 (see also assign02.f90) +module m0 + procedure(),pointer,save :: p +end + module m interface subroutine s(i) @@ -324,4 +328,10 @@ subroutine s14 !ERROR: Statement function 'sf' may not be the target of a pointer assignment ptr => sf end subroutine + + subroutine s15 + use m0 + intrinsic sin + p=>sin ! ok + end end From d46c639ebf19eacc6bd37240981ff1b1ef497b1b Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:06:05 -0700 Subject: [PATCH 049/112] [flang] Fix derived type compatibility checking in ALLOCATE (#102035) The derived type compatibility checking for ALLOCATE statements with SOURCE= or MOLD= was only checking for the same derived type name. That is a necessary but not sufficient check, and it can produce bogus errors as well as miss valid errors. Fixes https://github.com/llvm/llvm-project/issues/101909. --- flang/include/flang/Evaluate/type.h | 4 ++- flang/lib/Evaluate/type.cpp | 8 +++++- flang/lib/Semantics/check-allocate.cpp | 6 +++-- flang/test/Semantics/allocate08.f90 | 36 ++++++++++++++++++++++++++ 4 files changed, 50 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Evaluate/type.h b/flang/include/flang/Evaluate/type.h index 16c2b319ff1de6d..bd8887dbce4e82b 100644 --- a/flang/include/flang/Evaluate/type.h +++ b/flang/include/flang/Evaluate/type.h @@ -494,7 +494,9 @@ bool IsCUDAIntrinsicType(const DynamicType &); // Determine whether two derived type specs are sufficiently identical // to be considered the "same" type even if declared separately. 
bool AreSameDerivedType( - const semantics::DerivedTypeSpec &x, const semantics::DerivedTypeSpec &y); + const semantics::DerivedTypeSpec &, const semantics::DerivedTypeSpec &); +bool AreSameDerivedTypeIgnoringTypeParameters( + const semantics::DerivedTypeSpec &, const semantics::DerivedTypeSpec &); // For generating "[extern] template class", &c. boilerplate #define EXPAND_FOR_EACH_INTEGER_KIND(M, P, S) \ diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index 463ac01da0e295e..5ecc3701b4f2460 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -505,7 +505,13 @@ bool AreSameDerivedType( return AreSameDerivedType(x, y, false, false, inProgress); } -bool AreSameDerivedType( +bool AreSameDerivedTypeIgnoringTypeParameters( + const semantics::DerivedTypeSpec &x, const semantics::DerivedTypeSpec &y) { + SetOfDerivedTypePairs inProgress; + return AreSameDerivedType(x, y, true, true, inProgress); +} + +static bool AreSameDerivedType( const semantics::DerivedTypeSpec *x, const semantics::DerivedTypeSpec *y) { return x == y || (x && y && AreSameDerivedType(*x, *y)); } diff --git a/flang/lib/Semantics/check-allocate.cpp b/flang/lib/Semantics/check-allocate.cpp index 8f7a200d23239b7..a5363a6710d3196 100644 --- a/flang/lib/Semantics/check-allocate.cpp +++ b/flang/lib/Semantics/check-allocate.cpp @@ -270,11 +270,13 @@ static bool IsTypeCompatible( const DeclTypeSpec &type1, const DerivedTypeSpec &derivedType2) { if (const DerivedTypeSpec * derivedType1{type1.AsDerived()}) { if (type1.category() == DeclTypeSpec::Category::TypeDerived) { - return &derivedType1->typeSymbol() == &derivedType2.typeSymbol(); + return evaluate::AreSameDerivedTypeIgnoringTypeParameters( + *derivedType1, derivedType2); } else if (type1.category() == DeclTypeSpec::Category::ClassDerived) { for (const DerivedTypeSpec *parent{&derivedType2}; parent; parent = parent->typeSymbol().GetParentTypeSpec()) { - if (&derivedType1->typeSymbol() == &parent->typeSymbol()) { 
+ if (evaluate::AreSameDerivedTypeIgnoringTypeParameters( + *derivedType1, *parent)) { return true; } } diff --git a/flang/test/Semantics/allocate08.f90 b/flang/test/Semantics/allocate08.f90 index cc074a149ae9ed4..b2b88f78b32bea5 100644 --- a/flang/test/Semantics/allocate08.f90 +++ b/flang/test/Semantics/allocate08.f90 @@ -95,6 +95,42 @@ subroutine bar end subroutine end module +module mod1 + type, bind(C) :: t + integer :: n + end type + type(t), allocatable :: x +end + +module mod2 + type, bind(C) :: t + integer :: n + end type + type(t), allocatable :: x +end + +module mod3 + type, bind(C) :: t + real :: a + end type + type(t), allocatable :: x +end + +subroutine same_type + use mod1, only: a => x + use mod2, only: b => x + use mod3, only: c => x + allocate(a) + allocate(b, source=a) ! ok + deallocate(a) + allocate(a, source=b) ! ok + !ERROR: Allocatable object in ALLOCATE must be type compatible with source expression from MOLD or SOURCE + allocate(c, source=a) + deallocate(a) + !ERROR: Allocatable object in ALLOCATE must be type compatible with source expression from MOLD or SOURCE + allocate(a, source=c) +end + ! Related to C945, check typeless expression are caught subroutine sub From e83c5b25f3173791d72b14d3837a07a6b55b871c Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:06:32 -0700 Subject: [PATCH 050/112] =?UTF-8?q?[flang]=20Warn=20about=20automatic=20da?= =?UTF-8?q?ta=20in=20main=20program,=20disallow=20in=20BLOCK=20=E2=80=A6?= =?UTF-8?q?=20(#102045)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …DATA We allow automatic data objects in the specification part of the main program; add an optional portability warning and documentation. Don't allow them in BLOCK DATA. They're already disallowed as module variables. 
--- flang/docs/Extensions.md | 2 ++ flang/include/flang/Common/Fortran-features.h | 2 +- flang/lib/Semantics/check-declarations.cpp | 29 ++++++++++++++++--- flang/test/Semantics/resolve77.f90 | 15 +++++++++- flang/test/Semantics/stmt-func01.f90 | 1 + 5 files changed, 43 insertions(+), 6 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 093596c9dc8ebca..fb57744c2157033 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -384,6 +384,8 @@ end * `BIND(C, NAME="...", CDEFINED)` signifies that the storage for an interoperable variable will be allocated outside of Fortran, probably by a C or C++ external definition. +* An automatic data object may be declared in the specification part + of the main program. ### Extensions supported when enabled by options diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 938da08e19d6b17..518304191d63f7d 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -51,7 +51,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, BadBranchTarget, ConvertedArgument, HollerithPolymorphic, ListDirectedSize, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, - UndefinableAsynchronousOrVolatileActual) + UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index a52f013a70b9d9e..be0de5b4e03a42a 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -247,6 +247,14 @@ void CheckHelper::Check( } } +static bool IsBlockData(const Scope &scope) { + return scope.kind() == Scope::Kind::BlockData; +} + +static bool IsBlockData(const 
Symbol &symbol) { + return symbol.scope() && IsBlockData(*symbol.scope()); +} + void CheckHelper::Check(const Symbol &symbol) { if (symbol.name().size() > common::maxNameLen && &symbol == &symbol.GetUltimate()) { @@ -463,6 +471,23 @@ void CheckHelper::Check(const Symbol &symbol) { messages_.Say( "Automatic data object '%s' may not appear in a module"_err_en_US, symbol.name()); + } else if (IsBlockData(symbol.owner())) { + messages_.Say( + "Automatic data object '%s' may not appear in a BLOCK DATA subprogram"_err_en_US, + symbol.name()); + } else if (symbol.owner().kind() == Scope::Kind::MainProgram) { + if (context_.IsEnabled(common::LanguageFeature::AutomaticInMainProgram)) { + if (context_.ShouldWarn( + common::LanguageFeature::AutomaticInMainProgram)) { + messages_.Say( + "Automatic data object '%s' should not appear in the specification part of a main program"_port_en_US, + symbol.name()); + } + } else { + messages_.Say( + "Automatic data object '%s' may not appear in the specification part of a main program"_err_en_US, + symbol.name()); + } } } if (IsProcedure(symbol)) { @@ -2799,10 +2824,6 @@ static bool IsSubprogramDefinition(const Symbol &symbol) { symbol.scope()->kind() == Scope::Kind::Subprogram; } -static bool IsBlockData(const Symbol &symbol) { - return symbol.scope() && symbol.scope()->kind() == Scope::Kind::BlockData; -} - static bool IsExternalProcedureDefinition(const Symbol &symbol) { return IsBlockData(symbol) || (IsSubprogramDefinition(symbol) && diff --git a/flang/test/Semantics/resolve77.f90 b/flang/test/Semantics/resolve77.f90 index ffee10271d51bfa..943993ee74d76ee 100644 --- a/flang/test/Semantics/resolve77.f90 +++ b/flang/test/Semantics/resolve77.f90 @@ -1,4 +1,4 @@ -! RUN: %python %S/test_errors.py %s %flang_fc1 +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic ! Tests valid and invalid usage of forward references to procedures ! in specification expressions. 
module m @@ -56,3 +56,16 @@ pure integer function if2(n) if2 = n end function end subroutine + +block data + common /blk2/ n + data n/100/ + !ERROR: Automatic data object 'a' may not appear in a BLOCK DATA subprogram + real a(n) +end + +program main + common /blk2/ n + !PORTABILITY: Automatic data object 'a' should not appear in the specification part of a main program + real a(n) +end diff --git a/flang/test/Semantics/stmt-func01.f90 b/flang/test/Semantics/stmt-func01.f90 index 83c31ded1d39b93..a87b0d7af52b470 100644 --- a/flang/test/Semantics/stmt-func01.f90 +++ b/flang/test/Semantics/stmt-func01.f90 @@ -10,6 +10,7 @@ program main pure integer function ifunc() end function end interface + !PORTABILITY: Automatic data object 'x1' should not appear in the specification part of a main program type(t1(k=4,l=ifunc())) x1 !PORTABILITY: Statement function 'sf1' should not contain an array constructor sf1(n) = sum([(j,j=1,n)]) From 9390eb92212a584db75101d7e0ecd3f8819cd201 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:06:52 -0700 Subject: [PATCH 051/112] [flang] Catch impure calls in nested concurrent-headers (#102075) The start, end, and stride expressions of a concurrent-header in a DO CONCURRENT or FORALL statement can contain calls to impure functions... unless they appear in a statement that's nested in an enclosing DO CONCURRENT or FORALL construct. Ensure that we catch this nested case. 
--- flang/lib/Semantics/check-do-forall.cpp | 55 ++++++++++++++++++------- flang/lib/Semantics/check-do-forall.h | 1 + flang/test/Semantics/call11.f90 | 40 ++++++++++++++++++ 3 files changed, 81 insertions(+), 15 deletions(-) diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index 34225cd40619285..e55a01d80acfb67 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -269,8 +269,7 @@ class DoConcurrentBodyEnforce { const parser::CharBlock statementLocation{ GetImageControlStmtLocation(construct)}; auto &msg{context_.Say(statementLocation, - "An image control statement is not allowed in DO" - " CONCURRENT"_err_en_US)}; + "An image control statement is not allowed in DO CONCURRENT"_err_en_US)}; if (auto coarrayMsg{GetImageControlStmtCoarrayMsg(construct)}) { msg.Attach(statementLocation, *coarrayMsg); } @@ -372,8 +371,8 @@ class DoConcurrentVariableEnforce { // Find a DO or FORALL and enforce semantics checks on its body class DoContext { public: - DoContext(SemanticsContext &context, IndexVarKind kind) - : context_{context}, kind_{kind} {} + DoContext(SemanticsContext &context, IndexVarKind kind, bool isNested) + : context_{context}, kind_{kind}, isNested_{isNested} {} // Mark this DO construct as a point of definition for the DO variables // or index-names it contains. 
If they're already defined, emit an error @@ -743,13 +742,21 @@ class DoContext { std::get>(header.t)}) { CheckMaskIsPure(*mask); } - auto &controls{std::get>(header.t)}; + const auto &controls{ + std::get>(header.t)}; UnorderedSymbolSet indexNames; for (const parser::ConcurrentControl &control : controls) { const auto &indexName{std::get(control.t)}; if (indexName.symbol) { indexNames.insert(*indexName.symbol); } + if (isNested_) { + CheckForImpureCall(std::get<1>(control.t)); + CheckForImpureCall(std::get<2>(control.t)); + if (const auto &stride{std::get<3>(control.t)}) { + CheckForImpureCall(*stride); + } + } } if (!indexNames.empty()) { for (const parser::ConcurrentControl &control : controls) { @@ -808,13 +815,24 @@ class DoContext { CheckConcurrentHeader(std::get(concurrent.t)); } - template void CheckForImpureCall(const T &x) { + template void CheckForImpureCall(const T &x) const { if (auto bad{FindImpureCall(context_.foldingContext(), x)}) { context_.Say( "Impure procedure '%s' may not be referenced in a %s"_err_en_US, *bad, LoopKindName()); } } + void CheckForImpureCall(const parser::ScalarIntExpr &x) const { + const auto &parsedExpr{x.thing.thing.value()}; + auto oldLocation{context_.location()}; + context_.set_location(parsedExpr.source); + if (const auto &typedExpr{parsedExpr.typedExpr}) { + if (const auto &expr{typedExpr->v}) { + CheckForImpureCall(*expr); + } + } + context_.set_location(oldLocation); + } // Each index should be used on the LHS of each assignment in a FORALL void CheckForallIndexesUsed(const evaluate::Assignment &assignment) { @@ -870,40 +888,47 @@ class DoContext { SemanticsContext &context_; const IndexVarKind kind_; parser::CharBlock currentStatementSourcePosition_; + bool isNested_{false}; }; // class DoContext void DoForallChecker::Enter(const parser::DoConstruct &doConstruct) { - DoContext doContext{context_, IndexVarKind::DO}; + DoContext doContext{context_, IndexVarKind::DO, constructNesting_ > 0}; 
doContext.DefineDoVariables(doConstruct); } void DoForallChecker::Leave(const parser::DoConstruct &doConstruct) { - DoContext doContext{context_, IndexVarKind::DO}; + DoContext doContext{context_, IndexVarKind::DO, constructNesting_ > 0}; + ++constructNesting_; doContext.Check(doConstruct); doContext.ResetDoVariables(doConstruct); + --constructNesting_; } void DoForallChecker::Enter(const parser::ForallConstruct &construct) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.ActivateIndexVars(GetControls(construct)); + ++constructNesting_; + doContext.Check(construct); } void DoForallChecker::Leave(const parser::ForallConstruct &construct) { - DoContext doContext{context_, IndexVarKind::FORALL}; - doContext.Check(construct); + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.DeactivateIndexVars(GetControls(construct)); + --constructNesting_; } void DoForallChecker::Enter(const parser::ForallStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; + ++constructNesting_; + doContext.Check(stmt); doContext.ActivateIndexVars(GetControls(stmt)); } void DoForallChecker::Leave(const parser::ForallStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; - doContext.Check(stmt); + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.DeactivateIndexVars(GetControls(stmt)); + --constructNesting_; } void DoForallChecker::Leave(const parser::ForallAssignmentStmt &stmt) { - DoContext doContext{context_, IndexVarKind::FORALL}; + DoContext doContext{context_, IndexVarKind::FORALL, constructNesting_ > 0}; doContext.Check(stmt); } diff --git a/flang/lib/Semantics/check-do-forall.h b/flang/lib/Semantics/check-do-forall.h index 3b2ae59f5f3fff3..f08ff3467d4cc93 100644 --- a/flang/lib/Semantics/check-do-forall.h +++ 
b/flang/lib/Semantics/check-do-forall.h @@ -60,6 +60,7 @@ class DoForallChecker : public virtual BaseChecker { private: SemanticsContext &context_; int exprDepth_{0}; + int constructNesting_{0}; void SayBadLeave( StmtType, const char *enclosingStmt, const ConstructNode &) const; diff --git a/flang/test/Semantics/call11.f90 b/flang/test/Semantics/call11.f90 index f4f474079556238..7bc4931890dee59 100644 --- a/flang/test/Semantics/call11.f90 +++ b/flang/test/Semantics/call11.f90 @@ -42,6 +42,46 @@ subroutine test !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT a(j) = impure(j) ! C1139 end do + do concurrent (k=impure(1):1); end do ! ok + do concurrent (k=1:impure(1)); end do ! ok + do concurrent (k=1:1:impure(1)); end do ! ok + forall (k=impure(1):1); end forall ! ok + forall (k=1:impure(1)); end forall ! ok + forall (k=1:1:impure(1)); end forall ! ok + do concurrent (j=1:1) + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=impure(1):1); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=1:impure(1)); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + do concurrent (k=1:1:impure(1)); end do + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=impure(1):1); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=impure(1):1) a(k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:impure(1)) a(k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in DO CONCURRENT + forall (k=1:1:impure(1)) a(k) = 0. 
+ end do + forall (j=1:1) + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=impure(1):1); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:1:impure(1)); end forall + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=impure(1):1) a(j*k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:impure(1)) a(j*k) = 0. + !ERROR: Impure procedure 'impure' may not be referenced in a FORALL + forall (k=1:1:impure(1)) a(j*k) = 0. + end forall end subroutine subroutine test2 From 7c512cef61ea6894c09da8ae5dad6f1ed44812f7 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:07:19 -0700 Subject: [PATCH 052/112] [flang] Disallow references to some IEEE procedures in DO CONCURRENT (#102082) There's a numbered constraint that prohibits calls to some IEEE arithmetic and exception procedures within the body of a DO CONCURRENT construct. Clean up the implementation to catch missing cases. --- flang/lib/Semantics/check-do-forall.cpp | 44 ++++++++++++---------- flang/module/__fortran_ieee_exceptions.f90 | 2 +- flang/test/Semantics/doconcurrent01.f90 | 20 ++++++---- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index e55a01d80acfb67..d798244ff1ef2d3 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -285,19 +285,32 @@ class DoConcurrentBodyEnforce { .Attach(doConcurrentSourcePosition_, GetEnclosingDoMsg()); } - // C1139: call to impure procedure and ... 
- // C1141: cannot call ieee_get_flag, ieee_[gs]et_halting_mode - // It's not necessary to check the ieee_get* procedures because they're - // not pure, and impure procedures are caught by checks for constraint C1139 + // C1145, C1146: cannot call ieee_[gs]et_flag, ieee_[gs]et_halting_mode, + // ieee_[gs]et_status, ieee_set_rounding_mode, or ieee_set_underflow_mode void Post(const parser::ProcedureDesignator &procedureDesignator) { if (auto *name{std::get_if(&procedureDesignator.u)}) { - if (name->symbol && - fromScope(*name->symbol, "__fortran_ieee_exceptions"s)) { - if (name->source == "ieee_set_halting_mode") { - SayWithDo(context_, currentStatementSourcePosition_, - "IEEE_SET_HALTING_MODE is not allowed in DO " - "CONCURRENT"_err_en_US, - doConcurrentSourcePosition_); + if (name->symbol) { + const Symbol &ultimate{name->symbol->GetUltimate()}; + const Scope &scope{ultimate.owner()}; + if (const Symbol * module{scope.IsModule() ? scope.symbol() : nullptr}; + module && + (module->name() == "__fortran_ieee_arithmetic" || + module->name() == "__fortran_ieee_exceptions")) { + std::string s{ultimate.name().ToString()}; + static constexpr const char *badName[]{"ieee_get_flag", + "ieee_set_flag", "ieee_get_halting_mode", "ieee_set_halting_mode", + "ieee_get_status", "ieee_set_status", "ieee_set_rounding_mode", + "ieee_set_underflow_mode", nullptr}; + for (std::size_t j{0}; badName[j]; ++j) { + if (s.find(badName[j]) != s.npos) { + context_ + .Say(name->source, + "'%s' may not be called in DO CONCURRENT"_err_en_US, + badName[j]) + .Attach(doConcurrentSourcePosition_, GetEnclosingDoMsg()); + break; + } + } } } } @@ -318,15 +331,6 @@ class DoConcurrentBodyEnforce { } private: - bool fromScope(const Symbol &symbol, const std::string &moduleName) { - if (symbol.GetUltimate().owner().IsModule() && - symbol.GetUltimate().owner().GetName().value().ToString() == - moduleName) { - return true; - } - return false; - } - std::set labels_; parser::CharBlock 
currentStatementSourcePosition_; SemanticsContext &context_; diff --git a/flang/module/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90 index 810a2b0e400f242..cebd60452018125 100644 --- a/flang/module/__fortran_ieee_exceptions.f90 +++ b/flang/module/__fortran_ieee_exceptions.f90 @@ -129,7 +129,7 @@ end subroutine ieee_set_modes_0 public :: ieee_set_modes interface ieee_set_status - subroutine ieee_set_status_0(status) + pure subroutine ieee_set_status_0(status) import ieee_status_type type(ieee_status_type), intent(in) :: status end subroutine ieee_set_status_0 diff --git a/flang/test/Semantics/doconcurrent01.f90 b/flang/test/Semantics/doconcurrent01.f90 index 7c13a26814e5be3..9bb2b4537683510 100644 --- a/flang/test/Semantics/doconcurrent01.f90 +++ b/flang/test/Semantics/doconcurrent01.f90 @@ -48,18 +48,22 @@ subroutine do_concurrent_test2(i,j,n,flag) change team (j) !ERROR: An image control statement is not allowed in DO CONCURRENT critical - call ieee_get_status(status) ! ok -!ERROR: IEEE_SET_HALTING_MODE is not allowed in DO CONCURRENT - call ieee_set_halting_mode(flag, halting) end critical end team !ERROR: ADVANCE specifier is not allowed in DO CONCURRENT write(*,'(a35)',advance='no') - end do - -! 
The following is OK - do concurrent (i = 1:n) - call ieee_set_flag(flag, flagValue) +!ERROR: 'ieee_get_status' may not be called in DO CONCURRENT + call ieee_get_status(status) +!ERROR: 'ieee_set_status' may not be called in DO CONCURRENT + call ieee_set_status(status) +!ERROR: 'ieee_get_halting_mode' may not be called in DO CONCURRENT + call ieee_get_halting_mode(flag, halting) +!ERROR: 'ieee_set_halting_mode' may not be called in DO CONCURRENT + call ieee_set_halting_mode(flag, halting) +!ERROR: 'ieee_get_flag' may not be called in DO CONCURRENT + call ieee_get_flag(flag, flagValue) +!ERROR: 'ieee_set_flag' may not be called in DO CONCURRENT + call ieee_set_flag(flag, flagValue) end do end subroutine do_concurrent_test2 From 25822dc392dcb8e15f6b24feecce06f28d07b8ad Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:07:40 -0700 Subject: [PATCH 053/112] [flang] Fix searches for polymorphic components (#102212) FindPolymorphicAllocatableUltimateComponent needs to be FindPolymorphicAllocatablePotentialComponent. The current search is missing cases where a derived type has an allocatable component whose type has a polymorphic allocatable component. 
--- flang/include/flang/Semantics/tools.h | 16 ++++++++++++++-- flang/lib/Semantics/check-declarations.cpp | 5 +++-- flang/lib/Semantics/definable.cpp | 3 ++- flang/lib/Semantics/tools.cpp | 10 +++++----- flang/test/Semantics/call10.f90 | 2 +- flang/test/Semantics/typeinfo11.f90 | 17 +++++++++++++++++ 6 files changed, 42 insertions(+), 11 deletions(-) create mode 100644 flang/test/Semantics/typeinfo11.f90 diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index ec275f349e81bfd..15c02ecc0058cc0 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -442,6 +442,18 @@ std::list> GetStorageAssociations(const Scope &); // closure of its components (including POINTERs) and the // PotentialAndPointer subobject components of its non-POINTER derived type // components. +// +// type t1 ultimate components: x, a, p +// real x direct components: x, a, p +// real, allocatable :: a potential components: x, a +// real, pointer :: p potential & pointers: x, a, p +// end type +// type t2 ultimate components: y, c%x, c%a, c%p, b +// real y direct components: y, c, c%x, c%a, c%p, b +// type(t1) :: c potential components: y, c, c%x, c%a, b, b%x, b%a +// type(t1), allocatable :: b potential & pointers: potentials + c%p + b%p +// end type +// // Parent and procedure components are considered against these definitions. 
// For this kind of iterator, the component tree is recursively visited in the // following order: @@ -620,8 +632,8 @@ UltimateComponentIterator::const_iterator FindAllocatableUltimateComponent( const DerivedTypeSpec &); DirectComponentIterator::const_iterator FindAllocatableOrPointerDirectComponent( const DerivedTypeSpec &); -UltimateComponentIterator::const_iterator -FindPolymorphicAllocatableUltimateComponent(const DerivedTypeSpec &); +PotentialComponentIterator::const_iterator +FindPolymorphicAllocatablePotentialComponent(const DerivedTypeSpec &); // The LabelEnforce class (given a set of labels) provides an error message if // there is a branch to a label which is not in the given set. diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index be0de5b4e03a42a..de3fa8794caedf7 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -405,9 +405,10 @@ void CheckHelper::Check(const Symbol &symbol) { messages_.Say( "Result of pure function may not have an impure FINAL subroutine"_err_en_US); } - if (auto bad{FindPolymorphicAllocatableUltimateComponent(*derived)}) { + if (auto bad{ + FindPolymorphicAllocatablePotentialComponent(*derived)}) { SayWithDeclaration(*bad, - "Result of pure function may not have polymorphic ALLOCATABLE ultimate component '%s'"_err_en_US, + "Result of pure function may not have polymorphic ALLOCATABLE potential component '%s'"_err_en_US, bad.BuildResultDesignatorName()); } } diff --git a/flang/lib/Semantics/definable.cpp b/flang/lib/Semantics/definable.cpp index ae76f668f6ce7b9..62fed63df4475cf 100644 --- a/flang/lib/Semantics/definable.cpp +++ b/flang/lib/Semantics/definable.cpp @@ -223,7 +223,8 @@ static std::optional WhyNotDefinableLast(parser::CharBlock at, } if (const DerivedTypeSpec * derived{GetDerivedTypeSpec(dyType)}) { if (!flags.test(DefinabilityFlag::PolymorphicOkInPure)) { - if (auto 
bad{FindPolymorphicAllocatableUltimateComponent(*derived)}) { + if (auto bad{ + FindPolymorphicAllocatablePotentialComponent(*derived)}) { return BlameSymbol(at, "'%s' has polymorphic component '%s' in a pure subprogram"_en_US, original, bad.BuildResultDesignatorName()); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index fdaf052c2d34eb3..57d84bde60b43c0 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -866,7 +866,7 @@ const Symbol *HasImpureFinal(const Symbol &original, std::optional rank) { bool MayRequireFinalization(const DerivedTypeSpec &derived) { return IsFinalizable(derived) || - FindPolymorphicAllocatableUltimateComponent(derived); + FindPolymorphicAllocatablePotentialComponent(derived); } bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) { @@ -1404,11 +1404,11 @@ DirectComponentIterator::const_iterator FindAllocatableOrPointerDirectComponent( return std::find_if(directs.begin(), directs.end(), IsAllocatableOrPointer); } -UltimateComponentIterator::const_iterator -FindPolymorphicAllocatableUltimateComponent(const DerivedTypeSpec &derived) { - UltimateComponentIterator ultimates{derived}; +PotentialComponentIterator::const_iterator +FindPolymorphicAllocatablePotentialComponent(const DerivedTypeSpec &derived) { + PotentialComponentIterator potentials{derived}; return std::find_if( - ultimates.begin(), ultimates.end(), IsPolymorphicAllocatable); + potentials.begin(), potentials.end(), IsPolymorphicAllocatable); } const Symbol *FindUltimateComponent(const DerivedTypeSpec &derived, diff --git a/flang/test/Semantics/call10.f90 b/flang/test/Semantics/call10.f90 index ffb3b48c329e72e..2d2f57934cd8aa2 100644 --- a/flang/test/Semantics/call10.f90 +++ b/flang/test/Semantics/call10.f90 @@ -78,7 +78,7 @@ pure function f07() ! C1585 class(t), allocatable :: f07 end function pure function f08() ! 
C1585 - !ERROR: Result of pure function may not have polymorphic ALLOCATABLE ultimate component '%a' + !ERROR: Result of pure function may not have polymorphic ALLOCATABLE potential component '%a' type(polyAlloc) :: f08 end function diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90 new file mode 100644 index 000000000000000..92efc8f9ea54b8a --- /dev/null +++ b/flang/test/Semantics/typeinfo11.f90 @@ -0,0 +1,17 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s + +!Tests that derived types with polymorphic potential subobject +!components do not have their noFinalizationNeeded flags set, even +!when those components are packaged within another allocatable. + +type t1 + class(*), allocatable :: a +end type +type t2 + type(t1), allocatable :: b +end type +type(t2) x +end + +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) From 245eb0a716a839ebe18bd5d489184bebd352e769 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:08:00 -0700 Subject: [PATCH 054/112] [flang] Catch structure constructor in its own type definition (#102241) The check for a structure constructor to a forward-referenced derived type wasn't tripping for constructors in the type definition itself. Set the forward reference flag unconditionally at the beginning of name resolution for the type definition. 
--- flang/lib/Semantics/expression.cpp | 3 +-- flang/lib/Semantics/resolve-names.cpp | 7 ++----- flang/test/Semantics/bad-forward-type.f90 | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 0ec96d447c0b9d4..4f1c53f1bb53fcb 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -3026,8 +3026,7 @@ const Symbol *AssumedTypeDummy( bool ExpressionAnalyzer::CheckIsValidForwardReference( const semantics::DerivedTypeSpec &dtSpec) { if (dtSpec.IsForwardReferenced()) { - Say("Cannot construct value for derived type '%s' " - "before it is defined"_err_en_US, + Say("Cannot construct value for derived type '%s' before it is defined"_err_en_US, dtSpec.name()); return false; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 2fe45f9c941d71f..b4875d87d172c2b 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -5507,11 +5507,8 @@ void DeclarationVisitor::Post(const parser::DerivedTypeStmt &x) { std::optional extendsType{ ResolveExtendsType(name, extendsName)}; DerivedTypeDetails derivedTypeDetails; - if (Symbol * typeSymbol{FindInScope(currScope(), name)}; typeSymbol && - typeSymbol->has() && - typeSymbol->get().isForwardReferenced()) { - derivedTypeDetails.set_isForwardReferenced(true); - } + // Catch any premature structure constructors within the definition + derivedTypeDetails.set_isForwardReferenced(true); auto &symbol{MakeSymbol(name, GetAttrs(), std::move(derivedTypeDetails))}; symbol.ReplaceName(name.source); derivedTypeInfo_.type = &symbol; diff --git a/flang/test/Semantics/bad-forward-type.f90 b/flang/test/Semantics/bad-forward-type.f90 index 432d450a15f3fc5..27c6045b0059fa2 100644 --- a/flang/test/Semantics/bad-forward-type.f90 +++ b/flang/test/Semantics/bad-forward-type.f90 @@ -76,7 +76,8 @@ subroutine s8 !ERROR: Cannot construct value for 
derived type 't2' before it is defined parameter(y=t2(12.3)) type t2 - real :: c + !ERROR: Cannot construct value for derived type 't2' before it is defined + real :: c = transfer(t2(),0.) end type end subroutine From b949a6f5e3219290ce090895e284d12fc76f9d61 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:08:26 -0700 Subject: [PATCH 055/112] [flang] Warn on useless IOMSG= (#102250) An I/O statement with IOMSG= but neither ERR= nor IOSTAT= deserves a warning to the effect that it's not useful. --- flang/include/flang/Common/Fortran-features.h | 3 ++- flang/lib/Semantics/check-io.cpp | 19 +++++++++++++++++++ flang/lib/Semantics/check-io.h | 1 + flang/lib/Semantics/semantics.cpp | 5 ++++- flang/test/Semantics/io05.f90 | 1 + flang/test/Semantics/undef-result01.f90 | 1 + 6 files changed, 28 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 518304191d63f7d..6ef5f44c89db07e 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -70,7 +70,7 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, IgnoredIntrinsicFunctionType, PreviousScalarUse, RedeclaredInaccessibleComponent, ImplicitShared, IndexVarRedefinition, IncompatibleImplicitInterfaces, BadTypeForTarget, - VectorSubscriptFinalization, UndefinedFunctionResult) + VectorSubscriptFinalization, UndefinedFunctionResult, UselessIomsg) using LanguageFeatures = EnumSet; using UsageWarnings = EnumSet; @@ -145,6 +145,7 @@ class LanguageFeatureControl { warnUsage_.set(UsageWarning::BadTypeForTarget); warnUsage_.set(UsageWarning::VectorSubscriptFinalization); warnUsage_.set(UsageWarning::UndefinedFunctionResult); + warnUsage_.set(UsageWarning::UselessIomsg); } LanguageFeatureControl(const LanguageFeatureControl &) = default; diff --git a/flang/lib/Semantics/check-io.cpp b/flang/lib/Semantics/check-io.cpp 
index 8bde737c4cb948d..54e8e09cbf7e48a 100644 --- a/flang/lib/Semantics/check-io.cpp +++ b/flang/lib/Semantics/check-io.cpp @@ -675,6 +675,7 @@ void IoChecker::Leave(const parser::BackspaceStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 + CheckForUselessIomsg(); Done(); } @@ -682,6 +683,7 @@ void IoChecker::Leave(const parser::CloseStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1208 + CheckForUselessIomsg(); Done(); } @@ -689,6 +691,7 @@ void IoChecker::Leave(const parser::EndfileStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 + CheckForUselessIomsg(); Done(); } @@ -696,6 +699,7 @@ void IoChecker::Leave(const parser::FlushStmt &) { CheckForPureSubprogram(); CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1243 + CheckForUselessIomsg(); Done(); } @@ -708,6 +712,7 @@ void IoChecker::Leave(const parser::InquireStmt &stmt) { "UNIT number or FILE"); // C1246 CheckForProhibitedSpecifier(IoSpecKind::File, IoSpecKind::Unit); // C1246 CheckForRequiredSpecifier(IoSpecKind::Id, IoSpecKind::Pending); // C1248 + CheckForUselessIomsg(); } Done(); } @@ -742,11 +747,13 @@ void IoChecker::Leave(const parser::OpenStmt &) { CheckForProhibitedSpecifier(flags_.test(Flag::AccessStream), "STATUS='STREAM'", IoSpecKind::Recl); // 12.5.6.15 } + CheckForUselessIomsg(); Done(); } void IoChecker::Leave(const parser::PrintStmt &) { CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -817,6 +824,7 @@ void IoChecker::Leave(const parser::RewindStmt &) { CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1240 CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -824,6 +832,7 @@ void IoChecker::Leave(const parser::WaitStmt &) { CheckForRequiredSpecifier( flags_.test(Flag::NumberUnit), "UNIT number"); // C1237 
CheckForPureSubprogram(); + CheckForUselessIomsg(); Done(); } @@ -883,6 +892,7 @@ void IoChecker::LeaveReadWrite() const { "FMT or NML"); // C1227 CheckForRequiredSpecifier(IoSpecKind::Round, flags_.test(Flag::FmtOrNml), "FMT or NML"); // C1227 + CheckForUselessIomsg(); } void IoChecker::SetSpecifier(IoSpecKind specKind) { @@ -1057,6 +1067,15 @@ void IoChecker::CheckForPureSubprogram() const { // C1597 } } +void IoChecker::CheckForUselessIomsg() const { + if (specifierSet_.test(IoSpecKind::Iomsg) && + !specifierSet_.test(IoSpecKind::Err) && + !specifierSet_.test(IoSpecKind::Iostat) && + context_.ShouldWarn(common::UsageWarning::UselessIomsg)) { + context_.Say("IOMSG= is useless without either ERR= or IOSTAT="_warn_en_US); + } +} + // Seeks out an allocatable or pointer ultimate component that is not // nested in a nonallocatable/nonpointer component with a specific // defined I/O procedure. diff --git a/flang/lib/Semantics/check-io.h b/flang/lib/Semantics/check-io.h index 0ef166f7f100edb..2fb03c63afe353d 100644 --- a/flang/lib/Semantics/check-io.h +++ b/flang/lib/Semantics/check-io.h @@ -125,6 +125,7 @@ class IoChecker : public virtual BaseChecker { void CheckForDefinableVariable(const A &var, const std::string &s) const; void CheckForPureSubprogram() const; + void CheckForUselessIomsg() const; parser::Message *CheckForBadIoType(const evaluate::DynamicType &, common::DefinedIo, parser::CharBlock) const; diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index 3cb24f6c6af4371..f7a277d1b414f6d 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -221,10 +221,13 @@ static bool PerformStatementSemantics( if (context.languageFeatures().IsEnabled(common::LanguageFeature::CUDA)) { SemanticsVisitor{context}.Walk(program); } - if (!context.AnyFatalError()) { + if (!context.messages().AnyFatalError()) { + // Do this if all messages are only warnings if 
(context.ShouldWarn(common::UsageWarning::UndefinedFunctionResult)) { WarnUndefinedFunctionResult(context, context.globalScope()); } + } + if (!context.AnyFatalError()) { pass2.CompileDataInitializationsIntoInitializers(); } return !context.AnyFatalError(); diff --git a/flang/test/Semantics/io05.f90 b/flang/test/Semantics/io05.f90 index 8480ea4b784c299..bef0d6db89524a8 100644 --- a/flang/test/Semantics/io05.f90 +++ b/flang/test/Semantics/io05.f90 @@ -55,6 +55,7 @@ inquire(1, read=c(1), write=c(2), sign=c(3), sign=c(4), read=c(5), write=c(1)) !ERROR: Duplicate IOMSG specifier + !WARNING: IOMSG= is useless without either ERR= or IOSTAT= inquire(10, iomsg=msg, pos=ipos, iomsg=msg) !ERROR: If ID appears, PENDING must also appear diff --git a/flang/test/Semantics/undef-result01.f90 b/flang/test/Semantics/undef-result01.f90 index a372fcd544feead..dd73f9c76df0a55 100644 --- a/flang/test/Semantics/undef-result01.f90 +++ b/flang/test/Semantics/undef-result01.f90 @@ -121,6 +121,7 @@ integer function defdBySize() end character(40) function defdByIomsg() + !WARNING: IOMSG= is useless without either ERR= or IOSTAT= write(123,*,iomsg=defdByIomsg) end From 7ea78643fe1577afb60bfc670357a79be53a31e8 Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 8 Aug 2024 11:08:48 -0700 Subject: [PATCH 056/112] [flang] Improve error message output (#102324) When a local character variable with non-constant length has an initializer, it's an error in a couple of ways (SAVE variable with unknown size, static initializer that isn't constant due to conversion to an unknown length). The error that f18 reports is the latter, but the message contains a formatted representation of the initialization expression that exposes a non-Fortran %SET_LENGTH() operation. Print the original expression in the message instead. 
--- flang/lib/Evaluate/check-expression.cpp | 2 +- flang/test/Semantics/structconst02.f90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 342aac4dd5d53e7..fef4620857a08ab 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -493,7 +493,7 @@ std::optional> NonPointerInitializationExpr(const Symbol &symbol, } else { context.messages().Say( "Initialization expression for '%s' (%s) cannot be computed as a constant value"_err_en_US, - symbol.name(), folded.AsFortran()); + symbol.name(), x.AsFortran()); } } else if (xType) { context.messages().Say( diff --git a/flang/test/Semantics/structconst02.f90 b/flang/test/Semantics/structconst02.f90 index 24ec0a196401574..71d6b720fb41a4c 100644 --- a/flang/test/Semantics/structconst02.f90 +++ b/flang/test/Semantics/structconst02.f90 @@ -14,7 +14,7 @@ end function realfunc integer(kind=ik) :: ix = int(0,kind=ik) real(kind=rk) :: rx = real(0.,kind=rk) complex(kind=zk) :: zx = cmplx(0.,0.,kind=zk) - !ERROR: Initialization expression for 'cx' (%SET_LENGTH(" ",len)) cannot be computed as a constant value + !ERROR: Initialization expression for 'cx' (" ") cannot be computed as a constant value character(kind=ck,len=len) :: cx = ' ' logical(kind=lk) :: lx = .false. 
real(kind=rk), pointer :: rp => NULL() From 88a85942cefc6c2bb220da639c358e98a591d8b1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 22:45:43 +0400 Subject: [PATCH 057/112] AMDGPU: Directly handle all atomicrmw cases in SIISelLowering (#102439) --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 28 -------- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 - llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 23 ++++++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 69 +++++++++++++++++-- 4 files changed, 85 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 862b5c7e3e3d753..3fc02c6da37cb8a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5999,34 +5999,6 @@ bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks } -TargetLowering::AtomicExpansionKind -AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { - switch (RMW->getOperation()) { - case AtomicRMWInst::Nand: - case AtomicRMWInst::FAdd: - case AtomicRMWInst::FSub: - case AtomicRMWInst::FMax: - case AtomicRMWInst::FMin: - return AtomicExpansionKind::CmpXChg; - case AtomicRMWInst::Xchg: { - const DataLayout &DL = RMW->getFunction()->getDataLayout(); - unsigned ValSize = DL.getTypeSizeInBits(RMW->getType()); - if (ValSize == 32 || ValSize == 64) - return AtomicExpansionKind::None; - return AtomicExpansionKind::CmpXChg; - } - default: { - if (auto *IntTy = dyn_cast(RMW->getType())) { - unsigned Size = IntTy->getBitWidth(); - if (Size == 32 || Size == 64) - return AtomicExpansionKind::None; - } - - return AtomicExpansionKind::CmpXChg; - } - } -} - /// Whether it is profitable to sink the operands of an /// Instruction I to the basic block of I. /// This helps using several modifiers (like abs and neg) more often. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 37572af3897f2ec..1a5244f7ec809b8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -388,8 +388,6 @@ class AMDGPUTargetLowering : public TargetLowering { return MVT::i32; } - AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; - bool shouldSinkOperands(Instruction *I, SmallVectorImpl &Ops) const override; }; diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 7e0d96622f3c5da..c79688bf038be30 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -2175,14 +2175,33 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, TargetLowering::AtomicExpansionKind R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { + case AtomicRMWInst::Nand: + case AtomicRMWInst::FAdd: + case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: + return AtomicExpansionKind::CmpXChg; case AtomicRMWInst::UIncWrap: case AtomicRMWInst::UDecWrap: // FIXME: Cayman at least appears to have instructions for this, but the // instruction defintions appear to be missing. 
return AtomicExpansionKind::CmpXChg; + case AtomicRMWInst::Xchg: { + const DataLayout &DL = RMW->getFunction()->getDataLayout(); + unsigned ValSize = DL.getTypeSizeInBits(RMW->getType()); + if (ValSize == 32 || ValSize == 64) + return AtomicExpansionKind::None; + return AtomicExpansionKind::CmpXChg; + } default: - break; + if (auto *IntTy = dyn_cast(RMW->getType())) { + unsigned Size = IntTy->getBitWidth(); + if (Size == 32 || Size == 64) + return AtomicExpansionKind::None; + } + + return AtomicExpansionKind::CmpXChg; } - return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); + llvm_unreachable("covered atomicrmw op switch"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 4e9c271197613b1..8497aa33449e995 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16114,6 +16114,39 @@ static bool isBFloat2(Type *Ty) { return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); } +/// \return true if atomicrmw integer ops work for the type. +static bool isAtomicRMWLegalIntTy(Type *Ty) { + if (auto *IT = dyn_cast(Ty)) { + unsigned BW = IT->getBitWidth(); + return BW == 32 || BW == 64; + } + + return false; +} + +/// \return true if this atomicrmw xchg type can be selected. 
+static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) { + Type *Ty = RMW->getType(); + if (isAtomicRMWLegalIntTy(Ty)) + return true; + + if (PointerType *PT = dyn_cast(Ty)) { + const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout(); + unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace()); + return BW == 32 || BW == 64; + } + + if (Ty->isFloatTy() || Ty->isDoubleTy()) + return true; + + if (FixedVectorType *VT = dyn_cast(Ty)) { + return VT->getNumElements() == 2 && + VT->getElementType()->getPrimitiveSizeInBits() == 16; + } + + return false; +} + /// \returns true if it's valid to emit a native instruction for \p RMW, based /// on the properties of the target memory. static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, @@ -16142,6 +16175,14 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, .getValueAsBool(); } +/// \return Action to perform on AtomicRMWInsts for integer operations. +static TargetLowering::AtomicExpansionKind +atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) { + return isAtomicRMWLegalIntTy(RMW->getType()) + ? TargetLowering::AtomicExpansionKind::None + : TargetLowering::AtomicExpansionKind::CmpXChg; +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16161,7 +16202,22 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { SSID == SyncScope::System || SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); - switch (RMW->getOperation()) { + auto Op = RMW->getOperation(); + switch (Op) { + case AtomicRMWInst::Xchg: { + // PCIe supports add and xchg for system atomics. + return isAtomicRMWLegalXChgTy(RMW) + ? TargetLowering::AtomicExpansionKind::None + : TargetLowering::AtomicExpansionKind::CmpXChg; + + // PCIe supports add and xchg for system atomics. 
+ return atomicSupportedIfLegalIntType(RMW); + } + case AtomicRMWInst::Add: + case AtomicRMWInst::And: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: + return atomicSupportedIfLegalIntType(RMW); case AtomicRMWInst::Sub: case AtomicRMWInst::Or: case AtomicRMWInst::Xor: { @@ -16173,7 +16229,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::Expand; } - break; + return atomicSupportedIfLegalIntType(RMW); } case AtomicRMWInst::FAdd: { Type *Ty = RMW->getType(); @@ -16335,13 +16391,16 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { if (HasSystemScope) return AtomicExpansionKind::CmpXChg; } - break; + + return atomicSupportedIfLegalIntType(RMW); } + case AtomicRMWInst::Nand: + case AtomicRMWInst::FSub: default: - break; + return AtomicExpansionKind::CmpXChg; } - return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); + llvm_unreachable("covered atomicrmw op switch"); } TargetLowering::AtomicExpansionKind From a821fee312d15941174827a70cb534c2f2fe1177 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Aug 2024 12:02:44 -0700 Subject: [PATCH 058/112] [ELF] scanRelocations: support .crel.eh_frame Follow-up to #98115. For EhInputSection, RelocationScanner::scan calls sortRels, which doesn't support the CREL iterator. We should set supportsCrel to false to ensure that the initial_location fields in .eh_frame FDEs are relocated. --- lld/ELF/Relocations.cpp | 10 ++++++---- lld/test/ELF/crel.s | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index c2f4f3e755b95ef..b60b00cee4a6022 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -459,7 +459,8 @@ class OffsetGetter { // InputSectionBase. 
class RelocationScanner { public: - template void scanSection(InputSectionBase &s); + template + void scanSection(InputSectionBase &s, bool isEH = false); private: InputSectionBase *sec; @@ -1617,10 +1618,11 @@ void RelocationScanner::scan(Relocs rels) { }); } -template void RelocationScanner::scanSection(InputSectionBase &s) { +template +void RelocationScanner::scanSection(InputSectionBase &s, bool isEH) { sec = &s; getter = OffsetGetter(s); - const RelsOrRelas rels = s.template relsOrRelas(); + const RelsOrRelas rels = s.template relsOrRelas(!isEH); if (rels.areRelocsCrel()) scan(rels.crels); else if (rels.areRelocsRel()) @@ -1658,7 +1660,7 @@ template void elf::scanRelocations() { RelocationScanner scanner; for (Partition &part : partitions) { for (EhInputSection *sec : part.ehFrame->sections) - scanner.template scanSection(*sec); + scanner.template scanSection(*sec, /*isEH=*/true); if (part.armExidx && part.armExidx->isLive()) for (InputSection *sec : part.armExidx->exidxSections) if (sec->isLive()) diff --git a/lld/test/ELF/crel.s b/lld/test/ELF/crel.s index d7c87be9a540257..1de3f314fc6770c 100644 --- a/lld/test/ELF/crel.s +++ b/lld/test/ELF/crel.s @@ -5,6 +5,7 @@ # RUN: ld.lld -pie a.o b.o -o out # RUN: llvm-objdump -d out | FileCheck %s # RUN: llvm-readelf -Srs out | FileCheck %s --check-prefix=RELOC +# RUN: llvm-dwarfdump --eh-frame out | FileCheck %s --check-prefix=UNWIND # CHECK: <_start>: # CHECK-NEXT: callq {{.*}} @@ -18,6 +19,13 @@ # RELOC: {{0*}}[[#DATA+8]] 0000000000000008 R_X86_64_RELATIVE [[#%x,DATA+0x8000000000000000]] +# RELOC: 00000000000012f4 0 NOTYPE GLOBAL DEFAULT [[#]] _start +# RELOC-NEXT: 00000000000012fe 0 NOTYPE GLOBAL DEFAULT [[#]] foo + +## initial_location fields in FDEs are correctly relocated. 
+# UNWIND: 00000018 00000010 0000001c FDE cie=00000000 pc=000012f4...000012fe +# UNWIND: 0000002c 00000010 00000030 FDE cie=00000000 pc=000012fe...0000130c + # RUN: ld.lld -pie --emit-relocs a.o b.o -o out1 # RUN: llvm-objdump -dr out1 | FileCheck %s --check-prefix=CHECKE # RUN: llvm-readelf -Sr out1 | FileCheck %s --check-prefix=RELOCE From 0a62980ad386ec429beb69cbcb8e361a7a6283c4 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 23:22:48 +0400 Subject: [PATCH 059/112] AMDGPU: Support VALU add instructions in localstackalloc (#101692) Pre-enable this optimization before allowing folds of frame indexes into add instructions. Disables this fold when using scratch instructions for now. I see some code size improvements with it, but the optimization needs to be smarter about the uses depending on the register classes. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 164 ++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 4 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 12 +- ...local-stack-alloc-add-references.gfx10.mir | 80 ++ .../local-stack-alloc-add-references.gfx8.mir | 863 ++++++++++++++++++ .../local-stack-alloc-add-references.gfx9.mir | 523 +++++++++++ 6 files changed, 1642 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir create mode 100644 llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir create mode 100644 llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index dd4e0d53202d4ed..ee72837a50fc436 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -797,6 +797,23 @@ int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const { int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const { + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case 
AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e32: { + int OtherIdx = Idx == 1 ? 2 : 1; + const MachineOperand &OtherOp = MI->getOperand(OtherIdx); + return OtherOp.isImm() ? OtherOp.getImm() : 0; + } + case AMDGPU::V_ADD_CO_U32_e64: { + int OtherIdx = Idx == 2 ? 3 : 2; + const MachineOperand &OtherOp = MI->getOperand(OtherIdx); + return OtherOp.isImm() ? OtherOp.getImm() : 0; + } + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return 0; @@ -809,7 +826,60 @@ int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI, return getScratchInstrOffset(MI); } +static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, + const MachineInstr &MI) { + assert(MI.getDesc().isAdd()); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); + + if (Src0.isFI()) { + return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), + Src1.getReg())); + } + + if (Src1.isFI()) { + return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(), + Src0.getReg())); + } + + return false; +} + bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { + // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes. + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: { + // TODO: We could handle this but it requires work to avoid violating + // operand restrictions. + if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 && + !isFIPlusImmOrVGPR(*this, *MI)) + return false; + [[fallthrough]]; + } + case AMDGPU::V_ADD_U32_e64: + // FIXME: This optimization is barely profitable enableFlatScratch as-is. + // + // Much of the benefit with the MUBUF handling is we avoid duplicating the + // shift of the frame register, which isn't needed with scratch. + // + // materializeFrameBaseRegister doesn't know the register classes of the + // uses, and unconditionally uses an s_add_i32, which will end up using a + // copy for the vector uses. 
+ return !ST.enableFlatScratch(); + case AMDGPU::V_ADD_CO_U32_e32: + if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 && + !isFIPlusImmOrVGPR(*this, *MI)) + return false; + // We can't deal with the case where the carry out has a use (though this + // should never happen) + return MI->getOperand(3).isDead(); + case AMDGPU::V_ADD_CO_U32_e64: + // TODO: Should we check use_empty instead? + return MI->getOperand(1).isDead(); + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return false; @@ -860,6 +930,8 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, .addFrameIndex(FrameIdx); if (ST.enableFlatScratch() ) { + // FIXME: Mark scc as dead + // FIXME: Make sure scc isn't live in. BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg) .addReg(OffsetReg, RegState::Kill) .addReg(FIReg); @@ -877,6 +949,86 @@ Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB, void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const { const SIInstrInfo *TII = ST.getInstrInfo(); + + switch (MI.getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: { + MachineOperand *FIOp = &MI.getOperand(2); + MachineOperand *ImmOp = &MI.getOperand(1); + if (!FIOp->isFI()) + std::swap(FIOp, ImmOp); + + if (!ImmOp->isImm()) { + assert(Offset == 0); + FIOp->ChangeToRegister(BaseReg, false); + TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI); + return; + } + + int64_t TotalOffset = ImmOp->getImm() + Offset; + if (TotalOffset == 0) { + MI.setDesc(TII->get(AMDGPU::COPY)); + for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) + MI.removeOperand(I); + + MI.getOperand(1).ChangeToRegister(BaseReg, false); + return; + } + + ImmOp->setImm(TotalOffset); + + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // FIXME: materializeFrameBaseRegister does 
not know the register class of + // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit + // a copy so we have a legal operand and hope the register coalescer can + // clean it up. + if (isSGPRReg(MRI, BaseReg)) { + Register BaseRegVGPR = + MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR) + .addReg(BaseReg); + MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false); + } else { + MI.getOperand(2).ChangeToRegister(BaseReg, false); + } + return; + } + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e64: { + int Src0Idx = MI.getNumExplicitDefs(); + MachineOperand *FIOp = &MI.getOperand(Src0Idx); + MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1); + if (!FIOp->isFI()) + std::swap(FIOp, ImmOp); + + if (!ImmOp->isImm()) { + FIOp->ChangeToRegister(BaseReg, false); + TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI); + return; + } + + int64_t TotalOffset = ImmOp->getImm() + Offset; + if (TotalOffset == 0) { + MI.setDesc(TII->get(AMDGPU::COPY)); + + for (unsigned I = MI.getNumOperands() - 1; I != 1; --I) + MI.removeOperand(I); + + MI.getOperand(1).ChangeToRegister(BaseReg, false); + } else { + FIOp->ChangeToRegister(BaseReg, false); + ImmOp->setImm(TotalOffset); + } + + return; + } + default: + break; + } + bool IsFlat = TII->isFLATScratch(MI); #ifndef NDEBUG @@ -925,6 +1077,18 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg, bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const { + + switch (MI->getOpcode()) { + case AMDGPU::V_ADD_U32_e32: + case AMDGPU::V_ADD_CO_U32_e32: + return true; + case AMDGPU::V_ADD_U32_e64: + case AMDGPU::V_ADD_CO_U32_e64: + return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset); + default: + break; + } + if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td 
b/llvm/lib/Target/AMDGPU/SOPInstructions.td index de3191bd91df608..2e73a1a15f6b323 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -640,12 +640,12 @@ class SOP2_64_32_32 pattern=[]> : SOP2_Pseudo < let Defs = [SCC] in { // Carry out goes to SCC -let isCommutable = 1 in { +let isCommutable = 1, isAdd = 1 in { def S_ADD_U32 : SOP2_32 <"s_add_u32">; def S_ADD_I32 : SOP2_32 <"s_add_i32", [(set i32:$sdst, (UniformBinFrag SSrc_b32:$src0, SSrc_b32:$src1))] >; -} // End isCommutable = 1 +} // End isCommutable = 1, isAdd = 1 def S_SUB_U32 : SOP2_32 <"s_sub_u32">; def S_SUB_I32 : SOP2_32 <"s_sub_i32", diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 44eb5f5abafe000..d17b4f24081312c 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -763,7 +763,11 @@ def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
-defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>; + +let isAdd = 1 in { + defm V_ADD_CO_U32 : VOP2bInst <"v_add_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_add_co_u32", 1>; +} + defm V_SUB_CO_U32 : VOP2bInst <"v_sub_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>; defm V_SUBREV_CO_U32 : VOP2bInst <"v_subrev_co_u32", VOP2b_I32_I1_I32_I32, null_frag, "v_sub_co_u32", 1>; defm V_ADDC_U32 : VOP2bInst <"v_addc_u32", VOP2b_I32_I1_I32_I32_I1, null_frag, "v_addc_u32", 1>; @@ -772,7 +776,11 @@ defm V_SUBBREV_U32 : VOP2bInst <"v_subbrev_u32", VOP2b_I32_I1_I32_I32_I1, null_f let SubtargetPredicate = HasAddNoCarryInsts, isReMaterializable = 1 in { -defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; + +let isAdd = 1 in { + defm V_ADD_U32 : VOP2Inst_VOPD <"v_add_u32", VOP_I32_I32_I32_ARITH, 0x10, "v_add_nc_u32", null_frag, "v_add_u32", 1>; +} + defm V_SUB_U32 : VOP2Inst <"v_sub_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, "v_sub_u32", 1>; } diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir new file mode 100644 index 000000000000000..0c31b36e90cb0c9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx10.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_u32_e64__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, 
alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], 256, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 256, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %stack.0, 512, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], -156, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__literal_offsets_commute + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 256, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 512, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + %2:vgpr_32 = V_ADD_U32_e64 %stack.0, 100, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %2 + SI_RETURN + +... + diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir new file mode 100644 index 000000000000000..b7ade2147e40cd2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx8.mir @@ -0,0 +1,863 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX803 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX900 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -mattr=+wavefrontsize64 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+wavefrontsize64 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_co_u32_e32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + 
stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX803: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX900: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 
= V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX940: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX10: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__literal_offsets_live_vcc + ; GFX12: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect 
attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]], implicit $vcc + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 256, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 512, %stack.0, implicit-def $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1, implicit $vcc + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_CO_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], 
[[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[V_ADD_U32_e64_]], 
implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__inline_imm_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 8, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_CO_U32_e32 8, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 16, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX803: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_CO_U32_e64_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX940: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX940-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX940-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets + ; GFX12: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX12-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 %stack.0 + ; GFX12-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32_xexec_hi = S_ADD_I32 killed [[S_MOV_B32_]], [[S_MOV_B32_1]], implicit-def $scc + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_I32_]] + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 8, [[S_ADD_I32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX803: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX900-LABEL: name: 
local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX900: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX940: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX10: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__inline_imm_offsets_live_vcc + ; GFX12: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN implicit [[V_ADD_CO_U32_e64_1]] + %0:vgpr_32, %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 16, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN implicit %2 + +... 
+ +--- +name: local_stack_alloc__s_add_i32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, 
implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN + %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %0 + %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__s_add_i32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, 
implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__inline_imm_offsets + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN + %0:sreg_32 = S_ADD_I32 8, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:sreg_32 = S_ADD_I32 16, %stack.0, implicit-def dead $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__s_add_i32__literal_offsets_live_scc +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX803-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX803: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX803-NEXT: S_NOP 0, implicit $scc + ; GFX803-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX803-NEXT: SI_RETURN implicit $scc + ; + ; GFX900-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX900: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX900-NEXT: S_NOP 0, implicit $scc + ; GFX900-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX900-NEXT: SI_RETURN implicit $scc + ; + ; GFX940-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX940: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX940-NEXT: S_NOP 0, implicit $scc + ; GFX940-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX940-NEXT: 
SI_RETURN implicit $scc + ; + ; GFX10-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX10: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX10-NEXT: S_NOP 0, implicit $scc + ; GFX10-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX10-NEXT: SI_RETURN implicit $scc + ; + ; GFX12-LABEL: name: local_stack_alloc__s_add_i32__literal_offsets_live_scc + ; GFX12: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_]] + ; GFX12-NEXT: S_NOP 0, implicit $scc + ; GFX12-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, [[S_ADD_I32_1]] + ; GFX12-NEXT: SI_RETURN implicit $scc + %0:sreg_32 = S_ADD_I32 256, %stack.0, implicit-def $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %0 + S_NOP 0, implicit $scc + %1:sreg_32 = S_ADD_I32 512, %stack.0, implicit-def $scc + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2359305 /* reguse:SReg_32 */, %1 + SI_RETURN implicit $scc + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX803: liveins: $vgpr0 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; 
GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], 
%vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %vgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX803: liveins: $vgpr0 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = 
COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__vgpr_offsets_commute + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 [[S_MOV_B32_]], %vgpr_offset, implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %stack.0, %vgpr_offset, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = 
V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit-def dead $vcc, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e32__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_]] + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX12-NEXT: 
[[V_ADD_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, [[COPY1]], implicit-def dead $vcc, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_CO_U32_e32 %sgpr_offset, %stack.0, implicit-def dead $vcc, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX900: liveins: $sgpr8 
+ ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[COPY1]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 
%stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, [[S_MOV_B32_]], 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX803-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX803: liveins: $sgpr8 + ; GFX803-NEXT: {{ $}} + ; GFX803-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX803-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX803-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX803-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX803-NEXT: SI_RETURN + ; + ; GFX900-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: 
INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY %sgpr_offset + ; GFX940-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], [[COPY1]], 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, 
[[V_ADD_CO_U32_e64_2]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_co_u32_e64__sgpr_offsets_commute + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xexec_hi = S_MOV_B32 %stack.0 + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_CO_U32_e64_2:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_3:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[S_MOV_B32_]], %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_CO_U32_e64_2]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32, dead %2:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32, dead %3:sreg_64_xexec = V_ADD_CO_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir new file mode 100644 index 000000000000000..7ed153133517786 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-add-references.gfx9.mir @@ -0,0 +1,523 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX900 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX940 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass=localstackalloc -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s + +--- +name: local_stack_alloc__v_add_u32_e32__literal_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* 
reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX940: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 256 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, [[V_ADD_U32_e64_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__literal_offsets + ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e32 256, %stack.0, implicit $exec + INLINEASM &"; 
use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 512, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +--- +name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX940: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; 
GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, [[V_ADD_U32_e64_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__inline_imm_offsets + ; GFX12: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e32 8, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 16, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX940: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__vgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; 
GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %vgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $vgpr0 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX900: liveins: $vgpr0 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX940: liveins: $vgpr0 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX10: liveins: $vgpr0 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: 
%vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_MOV_B32_e32_]], %vgpr_offset, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__vgpr_offsets_commute + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %vgpr_offset:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %vgpr_offset:vgpr_32 = COPY $vgpr0 + %0:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %stack.0, %vgpr_offset, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e32__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX900-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX940-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + 
; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX10-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, [[V_MOV_B32_e32_]], implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e32__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_]] + ; GFX12-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e32_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e32 %sgpr_offset, %stack.0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__sgpr_offsets +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: %sgpr_offset:sreg_32 = COPY 
$sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %sgpr_offset, %stack.0, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... 
+ +--- +name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute +tracksRegLiveness: true +stack: + - { id: 0, size: 4096, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr8 + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX900: liveins: $sgpr8 + ; GFX900-NEXT: {{ $}} + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX940: liveins: $sgpr8 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX940-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX10: liveins: $sgpr8 + ; GFX10-NEXT: {{ $}} + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; 
GFX10-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], %sgpr_offset, 0, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__sgpr_offsets_commute + ; GFX12: liveins: $sgpr8 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: %sgpr_offset:sreg_32 = COPY $sgpr8 + ; GFX12-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %sgpr_offset:sreg_32 = COPY $sgpr8 + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 %stack.0, %sgpr_offset, 0, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... + +# Should be OK to fold with clamp modifier, which should be preserved. 
+--- +name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier +tracksRegLiveness: true +stack: + - { id: 0, size: 64, alignment: 4 } +machineFunctionInfo: + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + ; GFX900-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX900: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX900-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX900-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed [[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX900-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX900-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec + ; GFX900-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX900-NEXT: SI_RETURN + ; + ; GFX940-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX940: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX940-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec + ; GFX940-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX940-NEXT: SI_RETURN + ; + ; GFX10-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX10: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 8 + ; GFX10-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GFX10-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 killed 
[[S_MOV_B32_]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_ADD_U32_e64_]] + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[COPY]] + ; GFX10-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 8, [[V_ADD_U32_e64_]], 1, implicit $exec + ; GFX10-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX10-NEXT: SI_RETURN + ; + ; GFX12-LABEL: name: local_stack_alloc__v_add_u32_e64__inline_imm_offsets_clamp_modifier + ; GFX12: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, 1, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_]] + ; GFX12-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 16, %stack.0, 1, implicit $exec + ; GFX12-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, [[V_ADD_U32_e64_1]] + ; GFX12-NEXT: SI_RETURN + %0:vgpr_32 = V_ADD_U32_e64 %stack.0, 8, /*clamp*/1, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %0 + %1:vgpr_32 = V_ADD_U32_e64 16, %stack.0, /*clamp*/1, implicit $exec + INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2228233 /* reguse:VGPR_32 */, %1 + SI_RETURN + +... From 8cae9dcd4a45ac78b22c05eff96b0fee3e1c5e55 Mon Sep 17 00:00:00 2001 From: Alexis Engelke Date: Thu, 8 Aug 2024 21:26:17 +0200 Subject: [PATCH 060/112] [AMDGPU] Clear load addresses between functions (#102515) SLoadAddresses previously held data across different functions and used these for dominance queries of blocks in different functions. This is not intended; clear the state at the end of the pass. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1 + .../CodeGen/AMDGPU/waitcnt-multiple-funcs.mir | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 1315aa08557888f..59a1eee8d4f91d1 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -2611,6 +2611,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { Modified = true; } ReleaseVGPRInsts.clear(); + SLoadAddresses.clear(); return Modified; } diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir new file mode 100644 index 000000000000000..a65ec9c67654927 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-multiple-funcs.mir @@ -0,0 +1,41 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass si-insert-waitcnts -verify-machineinstrs %s -o - | FileCheck %s + +--- +# CHECK-LABEL: name: t1 +# CHECK: liveins: $vgpr0 +name: t1 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0 +... + +--- +# CHECK-LABEL: name: t2 +# CHECK: liveins: $sgpr2_sgpr3 +# CHECK: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) +name: t2 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $sgpr2_sgpr3 + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr2_sgpr3, 0, 0 :: (load (s64), addrspace 4) +... 
+ +--- +# CHECK-LABEL: name: t3 +# CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 +# CHECK: $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) +name: t3 +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3 + $vgpr2 = BUFFER_ATOMIC_ADD_ADDR64_RTN $vgpr2, $vgpr0_vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 1, implicit $exec :: (load store (s32), addrspace 1) +... From 0bc640774880121e3e59d11011313d3ee8c52b01 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 23:44:07 +0400 Subject: [PATCH 061/112] TTI: Check legalization cost of add/sub overflow ISD nodes (#100518) --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 90 +++--- .../CostModel/ARM/active_lane_mask.ll | 24 +- .../Analysis/CostModel/ARM/arith-overflow.ll | 304 +++++++++--------- .../test/Analysis/CostModel/ARM/arith-ssat.ll | 140 ++++---- .../test/Analysis/CostModel/ARM/arith-usat.ll | 144 ++++----- 5 files changed, 358 insertions(+), 344 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index c0ee50db37e6761..ac3968de5d672ef 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2184,44 +2184,17 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Cost; } case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::sadd_with_overflow - ? 
BinaryOperator::Add - : BinaryOperator::Sub; - - // Add: - // Overflow -> (Result < LHS) ^ (RHS < 0) - // Sub: - // Overflow -> (Result < LHS) ^ (RHS > 0) - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += 2 * thisT()->getCmpSelInstrCost( - Instruction::ICmp, SumTy, OverflowTy, - CmpInst::ICMP_SGT, CostKind); - Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, - CostKind); - return Cost; - } + ISD = ISD::SADDO; + break; + case Intrinsic::ssub_with_overflow: + ISD = ISD::SSUBO; + break; case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: { - Type *SumTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned Opcode = IID == Intrinsic::uadd_with_overflow - ? BinaryOperator::Add - : BinaryOperator::Sub; - CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow - ? CmpInst::ICMP_ULT - : CmpInst::ICMP_UGT; - - InstructionCost Cost = 0; - Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); - Cost += - thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, - Pred, CostKind); - return Cost; - } + ISD = ISD::UADDO; + break; + case Intrinsic::usub_with_overflow: + ISD = ISD::USUBO; + break; case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { Type *MulTy = RetTy->getContainedType(0); @@ -2300,8 +2273,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; } + auto *ST = dyn_cast(RetTy); + Type *LegalizeTy = ST ? 
ST->getContainedType(0) : RetTy; + std::pair LT = getTypeLegalizationCost(LegalizeTy); + const TargetLoweringBase *TLI = getTLI(); - std::pair LT = getTypeLegalizationCost(RetTy); if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && @@ -2357,6 +2333,44 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { Pred, CostKind); return Cost; } + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::sadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + + // Add: + // Overflow -> (Result < LHS) ^ (RHS < 0) + // Sub: + // Overflow -> (Result < LHS) ^ (RHS > 0) + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += + 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy, + CmpInst::ICMP_SGT, CostKind); + Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, + CostKind); + return Cost; + } + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: { + Type *SumTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned Opcode = IID == Intrinsic::uadd_with_overflow + ? BinaryOperator::Add + : BinaryOperator::Sub; + CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow + ? CmpInst::ICMP_ULT + : CmpInst::ICMP_UGT; + + InstructionCost Cost = 0; + Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); + Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, + OverflowTy, Pred, CostKind); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { // Assume a default expansion. 
diff --git a/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll index cdd61c147fdad93..c6f1cbab80438c2 100644 --- a/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll +++ b/llvm/test/Analysis/CostModel/ARM/active_lane_mask.ll @@ -3,18 +3,18 @@ define void @get_lane_mask() { ; CHECK-LABEL: 'get_lane_mask' -; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 
undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v8i1_i16 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 24 for instruction: %mask_v4i1_i16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v2i1_i16 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll index 67ab70569175e73..484cd201d14a074 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -30,79 +30,79 @@ declare {<64 x i8>, <64 x i1>} @llvm.sadd.with.overflow.v64i8(<64 x i8>, <64 x define i32 @sadd(i32 %arg) { ; V8M-RECIP-LABEL: 'sadd' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } 
@llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; V8M-RECIP-NEXT: 
Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sadd' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-RECIP-NEXT: 
Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } 
@llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'sadd' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 154 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } 
@llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'sadd' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 
undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an 
estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } 
@llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sadd' @@ -110,34 +110,34 @@ define i32 @sadd(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; 
NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> 
undef, <16 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'sadd' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) 
; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) @@ -188,26 +188,26 @@ declare {<64 x i8>, <64 x i1>} @llvm.uadd.with.overflow.v64i8(<64 x i8>, <64 x define i32 @uadd(i32 %arg) 
{ ; V8M-RECIP-LABEL: 'uadd' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for 
instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'uadd' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -226,7 +226,7 @@ define i32 @uadd(i32 %arg) { ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'uadd' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -245,26 +245,26 @@ define i32 @uadd(i32 %arg) { ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'uadd' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 
x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 
2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> 
} @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'uadd' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -283,7 +283,7 @@ define i32 @uadd(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'uadd' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } 
@llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -346,79 +346,79 @@ declare {<64 x i8>, <64 x i1>} @llvm.ssub.with.overflow.v64i8(<64 x i8>, <64 x define i32 @ssub(i32 %arg) { ; V8M-RECIP-LABEL: 'ssub' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-RECIP-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 
x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; 
V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'ssub' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) 
-; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'ssub' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 434 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 154 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> 
undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 142 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 284 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'ssub' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; 
V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I8 = call { 
<32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x 
i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'ssub' @@ -426,34 +426,34 @@ define i32 @ssub(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { 
<8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; NEON-SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'ssub' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = 
call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 270 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) @@ -504,26 +504,26 @@ declare {<64 x i8>, <64 x i1>} @llvm.usub.with.overflow.v64i8(<64 x i8>, <64 x define i32 @usub(i32 %arg) { ; V8M-RECIP-LABEL: 'usub' -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 
20 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } 
@llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> 
undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'usub' -; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -542,7 +542,7 @@ define i32 @usub(i32 %arg) { ; NEON-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; MVE-RECIP-LABEL: 'usub' -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } 
@llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -561,26 +561,26 @@ define i32 @usub(i32 %arg) { ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'usub' -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for 
instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } 
@llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } 
@llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'usub' -; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) @@ -599,7 +599,7 @@ define i32 @usub(i32 %arg) { ; NEON-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; MVE-SIZE-LABEL: 'usub' -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x 
i64> undef, <8 x i64> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll index 384ebbee8eba144..da7096af4d80c1c 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll @@ -37,27 +37,27 @@ declare <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @add(i32 %arg) { ; V8M-RECIP-LABEL: 'add' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated 
cost of 96 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; 
V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) 
+; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'add' @@ -113,26 +113,26 @@ define i32 @add(i32 %arg) { ; V8M-SIZE-LABEL: 'add' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, 
<2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call <32 x i16> 
@llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> 
@llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'add' @@ -243,27 +243,27 @@ declare <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @sub(i32 %arg) { ; V8M-RECIP-LABEL: 'sub' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; 
V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> 
undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 
x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sub' @@ -319,26 +319,26 @@ define i32 @sub(i32 %arg) { ; V8M-SIZE-LABEL: 'sub' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8I64 = 
call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I16 = call <2 x 
i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V8I8 = 
call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 69 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V32I8 = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 131 for instruction: %V64I8 = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sub' diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll index 64d49f7e492db49..8a12ca2df234efe 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll @@ -37,27 +37,27 @@ declare <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @add(i32 %arg) { ; V8M-RECIP-LABEL: 'add' ; V8M-RECIP-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> 
undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; 
V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'add' @@ -112,27 +112,27 @@ define i32 @add(i32 %arg) { ; ; V8M-SIZE-LABEL: 'add' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 
x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> 
@llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> 
@llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.uadd.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'add' @@ -243,27 +243,27 @@ declare <64 x i8> @llvm.usub.sat.v64i8(<64 x i8>, <64 x i8>) define i32 @sub(i32 %arg) { ; V8M-RECIP-LABEL: 'sub' ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; V8M-RECIP-NEXT: 
Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x 
i16> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x 
i8> undef, <64 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-RECIP-LABEL: 'sub' @@ -318,27 +318,27 @@ define i32 @sub(i32 %arg) { ; ; V8M-SIZE-LABEL: 'sub' ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call <4 
x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I16 = call <4 x i16> 
@llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> 
@llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) -; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32I8 = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> undef, <32 x i8> undef) +; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V64I8 = call <64 x i8> @llvm.usub.sat.v64i8(<64 x i8> undef, <64 x i8> undef) ; V8M-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; NEON-SIZE-LABEL: 'sub' From d52cc9de5ad92854613143bbd5f8b0771b2db1e1 Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Thu, 8 Aug 2024 15:46:27 -0400 Subject: [PATCH 062/112] [Clang][AST][NFC] Store template parameter position for TemplateTypeParmType in TypeBit (#102481) `TemplateTypeParmType` currently stores the depth, index, and whether a template type parameter is a pack in a union of `CanonicalTTPTInfo` and `TemplateTypeParmDecl*`, and only the canonical type stores the position information. 
These bits can be stored for all `TemplateTypeParmTypes` in `TypeBits` to avoid unnecessary indirection when accessing the position information. --- clang/include/clang/AST/Type.h | 72 ++++++++++++++++------------------ clang/lib/AST/ASTContext.cpp | 6 +-- 2 files changed, 37 insertions(+), 41 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index a4804e4c6f61ca6..27618604192c51f 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2134,6 +2134,23 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned hasTypeDifferentFromDecl : 1; }; + class TemplateTypeParmTypeBitfields { + friend class TemplateTypeParmType; + + LLVM_PREFERRED_TYPE(TypeBitfields) + unsigned : NumTypeBits; + + /// The depth of the template parameter. + unsigned Depth : 15; + + /// Whether this is a template parameter pack. + LLVM_PREFERRED_TYPE(bool) + unsigned ParameterPack : 1; + + /// The index of the template parameter. + unsigned Index : 16; + }; + class SubstTemplateTypeParmTypeBitfields { friend class SubstTemplateTypeParmType; @@ -2257,6 +2274,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { TypeWithKeywordBitfields TypeWithKeywordBits; ElaboratedTypeBitfields ElaboratedTypeBits; VectorTypeBitfields VectorTypeBits; + TemplateTypeParmTypeBitfields TemplateTypeParmTypeBits; SubstTemplateTypeParmTypeBitfields SubstTemplateTypeParmTypeBits; SubstTemplateTypeParmPackTypeBitfields SubstTemplateTypeParmPackTypeBits; TemplateSpecializationTypeBitfields TemplateSpecializationTypeBits; @@ -6135,52 +6153,30 @@ class BTFTagAttributedType : public Type, public llvm::FoldingSetNode { class TemplateTypeParmType : public Type, public llvm::FoldingSetNode { friend class ASTContext; // ASTContext creates these - // Helper data collector for canonical types. 
- struct CanonicalTTPTInfo { - unsigned Depth : 15; - unsigned ParameterPack : 1; - unsigned Index : 16; - }; - - union { - // Info for the canonical type. - CanonicalTTPTInfo CanTTPTInfo; - - // Info for the non-canonical type. - TemplateTypeParmDecl *TTPDecl; - }; + // The associated TemplateTypeParmDecl for the non-canonical type. + TemplateTypeParmDecl *TTPDecl; - /// Build a non-canonical type. - TemplateTypeParmType(TemplateTypeParmDecl *TTPDecl, QualType Canon) + TemplateTypeParmType(unsigned D, unsigned I, bool PP, + TemplateTypeParmDecl *TTPDecl, QualType Canon) : Type(TemplateTypeParm, Canon, TypeDependence::DependentInstantiation | - (Canon->getDependence() & TypeDependence::UnexpandedPack)), - TTPDecl(TTPDecl) {} - - /// Build the canonical type. - TemplateTypeParmType(unsigned D, unsigned I, bool PP) - : Type(TemplateTypeParm, QualType(this, 0), - TypeDependence::DependentInstantiation | - (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)) { - CanTTPTInfo.Depth = D; - CanTTPTInfo.Index = I; - CanTTPTInfo.ParameterPack = PP; - } - - const CanonicalTTPTInfo& getCanTTPTInfo() const { - QualType Can = getCanonicalTypeInternal(); - return Can->castAs()->CanTTPTInfo; + (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)), + TTPDecl(TTPDecl) { + assert(!TTPDecl == Canon.isNull()); + TemplateTypeParmTypeBits.Depth = D; + TemplateTypeParmTypeBits.Index = I; + TemplateTypeParmTypeBits.ParameterPack = PP; } public: - unsigned getDepth() const { return getCanTTPTInfo().Depth; } - unsigned getIndex() const { return getCanTTPTInfo().Index; } - bool isParameterPack() const { return getCanTTPTInfo().ParameterPack; } - - TemplateTypeParmDecl *getDecl() const { - return isCanonicalUnqualified() ? 
nullptr : TTPDecl; + unsigned getDepth() const { return TemplateTypeParmTypeBits.Depth; } + unsigned getIndex() const { return TemplateTypeParmTypeBits.Index; } + bool isParameterPack() const { + return TemplateTypeParmTypeBits.ParameterPack; } + TemplateTypeParmDecl *getDecl() const { return TTPDecl; } + IdentifierInfo *getIdentifier() const; bool isSugared() const { return false; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8ccf4a2e7732219..0dbbe0043f70317 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5300,15 +5300,15 @@ QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index, if (TTPDecl) { QualType Canon = getTemplateTypeParmType(Depth, Index, ParameterPack); TypeParm = new (*this, alignof(TemplateTypeParmType)) - TemplateTypeParmType(TTPDecl, Canon); + TemplateTypeParmType(Depth, Index, ParameterPack, TTPDecl, Canon); TemplateTypeParmType *TypeCheck = TemplateTypeParmTypes.FindNodeOrInsertPos(ID, InsertPos); assert(!TypeCheck && "Template type parameter canonical type broken"); (void)TypeCheck; } else - TypeParm = new (*this, alignof(TemplateTypeParmType)) - TemplateTypeParmType(Depth, Index, ParameterPack); + TypeParm = new (*this, alignof(TemplateTypeParmType)) TemplateTypeParmType( + Depth, Index, ParameterPack, /*TTPDecl=*/nullptr, /*Canon=*/QualType()); Types.push_back(TypeParm); TemplateTypeParmTypes.InsertNode(TypeParm, InsertPos); From 5bbbdabd7eb6e0d5142db7b8ac4a220b7aa373cf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 23:50:21 +0400 Subject: [PATCH 063/112] TTI: Check legalization cost of mul overflow ISD nodes (#100519) --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 67 +++++++++++++----------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index ac3968de5d672ef..bdf53abe9533df3 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ 
b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2196,37 +2196,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { ISD = ISD::USUBO; break; case Intrinsic::smul_with_overflow: - case Intrinsic::umul_with_overflow: { - Type *MulTy = RetTy->getContainedType(0); - Type *OverflowTy = RetTy->getContainedType(1); - unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; - Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); - bool IsSigned = IID == Intrinsic::smul_with_overflow; - - unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; - - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - if (IsSigned) - Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - - Cost += thisT()->getCmpSelInstrCost( - BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); - return Cost; - } + ISD = ISD::SMULO; + break; + case Intrinsic::umul_with_overflow: + ISD = ISD::UMULO; + break; case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: { if (Tys.empty()) @@ -2371,6 +2345,37 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { OverflowTy, Pred, CostKind); return Cost; } + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: { + Type *MulTy = RetTy->getContainedType(0); + Type *OverflowTy = RetTy->getContainedType(1); + unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; + Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); + bool IsSigned = IID == 
Intrinsic::smul_with_overflow; + + unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + if (IsSigned) + Cost += thisT()->getArithmeticInstrCost( + Instruction::AShr, MulTy, CostKind, + {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + + Cost += thisT()->getCmpSelInstrCost( + BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); + return Cost; + } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { // Assume a default expansion. From 6a482972e01bf347c8ac22fc90f96fa01cb4eec1 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 8 Aug 2024 23:54:17 +0400 Subject: [PATCH 064/112] TTI: Check legalization cost of mulfix ISD nodes (#100520) --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 53 +++++++++++++----------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index bdf53abe9533df3..890c2b8ca36e117 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2159,30 +2159,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { ISD = ISD::USUBSAT; break; case Intrinsic::smul_fix: - case Intrinsic::umul_fix: { - unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; - Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); - - unsigned ExtOp = - IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; - TTI::CastContextHint CCH = TTI::CastContextHint::None; - - InstructionCost Cost = 0; - Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); - Cost += - thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); - Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, - CCH, CostKind); - Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, - CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, - {TTI::OK_AnyValue, TTI::OP_None}, - {TTI::OK_UniformConstantValue, TTI::OP_None}); - Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); - return Cost; - } + ISD = ISD::SMULFIX; + break; + case Intrinsic::umul_fix: + ISD = ISD::UMULFIX; + break; case Intrinsic::sadd_with_overflow: ISD = ISD::SADDO; break; @@ -2417,6 +2398,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } + case Intrinsic::smul_fix: + case Intrinsic::umul_fix: { + unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; + Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); + + unsigned ExtOp = + IID == Intrinsic::smul_fix ? 
Instruction::SExt : Instruction::ZExt; + TTI::CastContextHint CCH = TTI::CastContextHint::None; + + InstructionCost Cost = 0; + Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); + Cost += + thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); + Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, + CCH, CostKind); + Cost += thisT()->getArithmeticInstrCost( + Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost( + Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_None}); + Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); + return Cost; + } default: break; } From 8334d2bfd34e2666db173269525d17352afa7bac Mon Sep 17 00:00:00 2001 From: Med Ismail Bennani Date: Thu, 8 Aug 2024 12:55:10 -0700 Subject: [PATCH 065/112] [lldb/Interpreter] Fix ambiguous partial command resolution (#101934) This patch is a follow-up to #97263 that fix ambigous abbreviated command resolution. When multiple commands are resolved, instead of failing to pick a command to run, this patch changes to resolution logic to check if there is a single alias match and if so, it will run the alias instead of the other matches. This has as a side-effect that we don't need to make aliases for every substring of aliases to support abbrivated alias resolution. 
Signed-off-by: Med Ismail Bennani --- lldb/docs/use/tutorial.rst | 4 + .../lldb/Interpreter/CommandInterpreter.h | 4 + .../source/Commands/CommandObjectCommands.cpp | 8 +- .../source/Interpreter/CommandInterpreter.cpp | 84 ++++++++++++++----- .../TestAmbiguousCommands.py | 35 ++++++++ .../ambigous_commands/categories | 1 + 6 files changed, 116 insertions(+), 20 deletions(-) create mode 100644 lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py create mode 100644 lldb/test/API/functionalities/ambigous_commands/categories diff --git a/lldb/docs/use/tutorial.rst b/lldb/docs/use/tutorial.rst index 22354c6720e14ad..00e7befdd087a44 100644 --- a/lldb/docs/use/tutorial.rst +++ b/lldb/docs/use/tutorial.rst @@ -168,6 +168,10 @@ is more convenient to make the basic commands unique down to a letter or two, and then learn these sequences than to fill the namespace with lots of aliases, and then have to type them all the way out. +If the alias abbreviation or the full alias command collides with another +existing command, the command resolver will prefer to use the alias over any +other command as far as there is only one alias command match. + However, users are free to customize LLDB's command set however they like, and since LLDB reads the file ``~/.lldbinit`` at startup, you can store all your aliases there and they will be generally available to you. 
Your aliases are diff --git a/lldb/include/lldb/Interpreter/CommandInterpreter.h b/lldb/include/lldb/Interpreter/CommandInterpreter.h index 48f6618ab0e3929..2bafc30cc8e23ad 100644 --- a/lldb/include/lldb/Interpreter/CommandInterpreter.h +++ b/lldb/include/lldb/Interpreter/CommandInterpreter.h @@ -295,6 +295,10 @@ class CommandInterpreter : public Broadcaster, StringList *matches = nullptr, StringList *descriptions = nullptr) const; + CommandObject * + GetAliasCommandObject(llvm::StringRef cmd, StringList *matches = nullptr, + StringList *descriptions = nullptr) const; + /// Determine whether a root level, built-in command with this name exists. bool CommandExists(llvm::StringRef cmd) const; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index c63445b7c8c8686..7c439f4ddb93e38 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -322,7 +322,13 @@ rather than using a positional placeholder:" (lldb) command alias bl3 breakpoint set -f %1 -l 3 - Always sets a breakpoint on line 3 of whatever file is indicated.)"); + Always sets a breakpoint on line 3 of whatever file is indicated. 
+ +)" + + "If the alias abbreviation or the full alias command collides with another \ +existing command, the command resolver will prefer to use the alias over any \ +other command as far as there is only one alias command match."); CommandArgumentEntry arg1; CommandArgumentEntry arg2; diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 71c928ec811fc65..e45112530404b8e 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -520,10 +520,6 @@ void CommandInterpreter::Initialize() { cmd_obj_sp = GetCommandSPExact("scripting run"); if (cmd_obj_sp) { - AddAlias("sc", cmd_obj_sp); - AddAlias("scr", cmd_obj_sp); - AddAlias("scri", cmd_obj_sp); - AddAlias("scrip", cmd_obj_sp); AddAlias("script", cmd_obj_sp); } @@ -1302,6 +1298,39 @@ CommandObject *CommandInterpreter::GetUserCommandObject( return {}; } +CommandObject *CommandInterpreter::GetAliasCommandObject( + llvm::StringRef cmd, StringList *matches, StringList *descriptions) const { + auto find_exact = + [&](const CommandObject::CommandMap &map) -> CommandObject * { + auto found_elem = map.find(cmd.str()); + if (found_elem == map.end()) + return (CommandObject *)nullptr; + CommandObject *exact_cmd = found_elem->second.get(); + if (!exact_cmd) + return nullptr; + + if (matches) + matches->AppendString(exact_cmd->GetCommandName()); + + if (descriptions) + descriptions->AppendString(exact_cmd->GetHelp()); + + return exact_cmd; + return nullptr; + }; + + CommandObject *exact_cmd = find_exact(GetAliases()); + if (exact_cmd) + return exact_cmd; + + // We didn't have an exact command, so now look for partial matches. + StringList tmp_list; + StringList *matches_ptr = matches ? 
matches : &tmp_list; + AddNamesMatchingPartialString(GetAliases(), cmd, *matches_ptr); + + return {}; +} + bool CommandInterpreter::CommandExists(llvm::StringRef cmd) const { return m_command_dict.find(std::string(cmd)) != m_command_dict.end(); } @@ -3421,6 +3450,19 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, std::string next_word; StringList matches; bool done = false; + + auto build_alias_cmd = [&](std::string &full_name) { + revised_command_line.Clear(); + matches.Clear(); + std::string alias_result; + cmd_obj = + BuildAliasResult(full_name, scratch_command, alias_result, result); + revised_command_line.Printf("%s", alias_result.c_str()); + if (cmd_obj) { + wants_raw_input = cmd_obj->WantsRawCommandString(); + } + }; + while (!done) { char quote_char = '\0'; std::string suffix; @@ -3432,14 +3474,7 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, bool is_real_command = (!is_alias) || (cmd_obj != nullptr && !cmd_obj->IsAlias()); if (!is_real_command) { - matches.Clear(); - std::string alias_result; - cmd_obj = - BuildAliasResult(full_name, scratch_command, alias_result, result); - revised_command_line.Printf("%s", alias_result.c_str()); - if (cmd_obj) { - wants_raw_input = cmd_obj->WantsRawCommandString(); - } + build_alias_cmd(full_name); } else { if (cmd_obj) { llvm::StringRef cmd_name = cmd_obj->GetCommandName(); @@ -3486,21 +3521,32 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, if (cmd_obj == nullptr) { const size_t num_matches = matches.GetSize(); if (matches.GetSize() > 1) { - StreamString error_msg; - error_msg.Printf("Ambiguous command '%s'. 
Possible matches:\n", - next_word.c_str()); + StringList alias_matches; + GetAliasCommandObject(next_word, &alias_matches); + + if (alias_matches.GetSize() == 1) { + std::string full_name; + GetAliasFullName(alias_matches.GetStringAtIndex(0), full_name); + build_alias_cmd(full_name); + done = static_cast(cmd_obj); + } else { + StreamString error_msg; + error_msg.Printf("Ambiguous command '%s'. Possible matches:\n", + next_word.c_str()); - for (uint32_t i = 0; i < num_matches; ++i) { - error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i)); + for (uint32_t i = 0; i < num_matches; ++i) { + error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i)); + } + result.AppendRawError(error_msg.GetString()); } - result.AppendRawError(error_msg.GetString()); } else { // We didn't have only one match, otherwise we wouldn't get here. lldbassert(num_matches == 0); result.AppendErrorWithFormat("'%s' is not a valid command.\n", next_word.c_str()); } - return nullptr; + if (!done) + return nullptr; } if (cmd_obj->IsMultiwordObject()) { diff --git a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py new file mode 100644 index 000000000000000..14c66fefea7efda --- /dev/null +++ b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py @@ -0,0 +1,35 @@ +""" +Test how lldb reacts to ambiguous commands +""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class AmbiguousCommandTestCase(TestBase): + @no_debug_info_test + def test_ambiguous_command_with_alias(self): + command_interpreter = self.dbg.GetCommandInterpreter() + self.assertTrue(command_interpreter, VALID_COMMAND_INTERPRETER) + result = lldb.SBCommandReturnObject() + + command_interpreter.HandleCommand( + "command alias corefile target create -c %0", result + ) + self.assertTrue(result.Succeeded()) + + 
command_interpreter.ResolveCommand("co", result) + self.assertFalse(result.Succeeded()) + self.assertEqual( + result.GetError(), + "Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n", + ) + + command_interpreter.HandleCommand("command unalias continue", result) + self.assertTrue(result.Succeeded()) + + command_interpreter.ResolveCommand("co", result) + self.assertTrue(result.Succeeded()) + self.assertEqual(result.GetOutput(), "target create -c %0") diff --git a/lldb/test/API/functionalities/ambigous_commands/categories b/lldb/test/API/functionalities/ambigous_commands/categories new file mode 100644 index 000000000000000..3a3f4df6416b9c8 --- /dev/null +++ b/lldb/test/API/functionalities/ambigous_commands/categories @@ -0,0 +1 @@ +cmdline From 9a070d6d0f0c111c2269a912f98908c821993e37 Mon Sep 17 00:00:00 2001 From: jameshu15869 <55058507+jameshu15869@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:05:34 -0500 Subject: [PATCH 066/112] [libc] [gpu] Add Generic, NvSin, and OcmlSinf64 Throughput Benchmark (#101917) This PR implements https://github.com/lntue/llvm-project/commit/2a158426d4b90ffaa3eaecc9bc10e5aed11f1bcf to provide better throughput benchmarking for libc `sin()` and `__nv_sin()`. These changes have not been tested on AMDGPU yet, only compiled. 
--- libc/benchmarks/gpu/LibcGpuBenchmark.h | 42 +++++++------ .../benchmarks/gpu/src/math/sin_benchmark.cpp | 60 ++++++++---------- .../gpu/timing/amdgpu/CMakeLists.txt | 4 ++ libc/benchmarks/gpu/timing/amdgpu/timing.h | 61 +++++++++++++------ .../gpu/timing/nvptx/CMakeLists.txt | 4 ++ libc/benchmarks/gpu/timing/nvptx/timing.h | 37 +++++++++-- 6 files changed, 128 insertions(+), 80 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 2b85b146ed7459c..830e6f9e89a7439 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -21,7 +21,7 @@ namespace benchmarks { struct BenchmarkOptions { uint32_t initial_iterations = 1; - uint32_t min_iterations = 50; + uint32_t min_iterations = 1; uint32_t max_iterations = 10000000; uint32_t min_samples = 4; uint32_t max_samples = 1000; @@ -111,9 +111,15 @@ class Benchmark { }; // We want our random values to be approximately -// |real value| <= 2^(max_exponent) * (1 + (random 52 bits) * 2^-52) < -// 2^(max_exponent + 1) -template static T get_rand_input() { +// Output: a random number with the exponent field between min_exp and max_exp, +// i.e. 2^min_exp <= |real_value| < 2^(max_exp + 1), +// Caveats: +// -EXP_BIAS corresponding to denormal values, +// EXP_BIAS + 1 corresponding to inf or nan. +template +static T +get_rand_input(int max_exp = LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS, + int min_exp = -LIBC_NAMESPACE::fputil::FPBits::EXP_BIAS) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; // Required to correctly instantiate FPBits for floats and doubles. 
@@ -125,10 +131,11 @@ template static T get_rand_input() { static_cast(LIBC_NAMESPACE::rand()); else bits = LIBC_NAMESPACE::rand(); - double scale = 0.5 + LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN / 2048.0; + double scale = + static_cast(max_exp - min_exp + 1) / (2 * FPBits::EXP_BIAS + 1); FPBits fp(bits); fp.set_biased_exponent( - static_cast(fp.get_biased_exponent() * scale)); + static_cast(fp.get_biased_exponent() * scale + min_exp)); return fp.get_val(); } @@ -141,19 +148,15 @@ template class MathPerf { public: typedef T Func(T); - static uint64_t run_perf_in_range(Func f, StorageType starting_bit, - StorageType ending_bit, StorageType step) { - uint64_t total_time = 0; - if (step <= 0) - step = 1; - volatile T result; - for (StorageType bits = starting_bit; bits < ending_bit; bits += step) { - T x = FPBits(bits).get_val(); - total_time += LIBC_NAMESPACE::latency(f, x); - } - StorageType num_runs = (ending_bit - starting_bit) / step + 1; - - return total_time / num_runs; + template + static uint64_t run_throughput_in_range(Func f, int min_exp, int max_exp) { + cpp::array inputs; + for (size_t i = 0; i < N; ++i) + inputs[i] = get_rand_input(min_exp, max_exp); + + uint64_t total_time = LIBC_NAMESPACE::throughput(f, inputs); + + return total_time / N; } }; @@ -176,5 +179,4 @@ template class MathPerf { #define SINGLE_WAVE_BENCHMARK(SuiteName, TestName, Func) \ BENCHMARK_N_THREADS(SuiteName, TestName, Func, \ LIBC_NAMESPACE::gpu::get_lane_size()) - #endif diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index 5849ea3e99bb094..e86961790b9438a 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -15,51 +15,41 @@ #include "src/math/amdgpu/declarations.h" #endif -constexpr double M_PI = 3.14159265358979323846; -uint64_t get_bits(double x) { - return LIBC_NAMESPACE::cpp::bit_cast(x); -} - // BENCHMARK() expects a function that with no parameters 
that returns a // uint64_t representing the latency. Defining each benchmark using macro that // expands to a lambda to allow us to switch the implementation of `sin()` to // easily register NVPTX benchmarks. -#define BM_RANDOM_INPUT(Func) \ - []() { \ - double x = LIBC_NAMESPACE::benchmarks::get_rand_input(); \ - return LIBC_NAMESPACE::latency(Func, x); \ - } -BENCHMARK(LlvmLibcSinGpuBenchmark, Sin, BM_RANDOM_INPUT(LIBC_NAMESPACE::sin)); - -#define BM_TWO_PI(Func) \ +#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \ []() { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_perf_in_range( \ - Func, 0, get_bits(2 * M_PI), get_bits(M_PI / 64)); \ + return LIBC_NAMESPACE::benchmarks::MathPerf< \ + double>::run_throughput_in_range(Func, MIN_EXP, MAX_EXP); \ } -BENCHMARK(LlvmLibcSinGpuBenchmark, SinTwoPi, BM_TWO_PI(LIBC_NAMESPACE::sin)); -#define BM_LARGE_INT(Func) \ - []() { \ - return LIBC_NAMESPACE::benchmarks::MathPerf::run_perf_in_range( \ - Func, 0, get_bits(1 << 30), get_bits(1 << 4)); \ - } -BENCHMARK(LlvmLibcSinGpuBenchmark, SinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::sin)); +#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \ + BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \ + BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \ + BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \ + SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \ + BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096)) + +BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023); +BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3); +BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); +BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCHMARK(LlvmLibcSinGpuBenchmark, NvSin, - BM_RANDOM_INPUT(LIBC_NAMESPACE::__nv_sin)); -BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinTwoPi, - BM_TWO_PI(LIBC_NAMESPACE::__nv_sin)); 
-BENCHMARK(LlvmLibcSinGpuBenchmark, NvSinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::__nv_sin)); +BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); +BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); +BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); +BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSin, - BM_RANDOM_INPUT(LIBC_NAMESPACE::__ocml_sin_f64)); -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinTwoPi, - BM_TWO_PI(LIBC_NAMESPACE::__ocml_sin_f64)); -BENCHMARK(LlvmLibcSinGpuBenchmark, AmdgpuSinLargeInt, - BM_LARGE_INT(LIBC_NAMESPACE::__ocml_sin_f64)); +BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); +BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); +BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); +BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); #endif diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt index 179429db9a09ae8..aa5dcd33bee9c81 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt @@ -4,4 +4,8 @@ add_header_library( timing.h DEPENDS libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.macros.attributes + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.array ) diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index e308d619e956956..d5c3df27b7de605 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU +#include "src/__support/CPP/array.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -17,14 +18,6 @@ #include -// AMDGPU does not support input register constraints for 
i1 and i8, so we must -// cast them to uint16_t's before loading them into registers. -#define FORCE_TO_REGISTER(TYPE, VARIABLE) \ - if constexpr (cpp::is_same_v || cpp::is_same_v) \ - asm("" ::"v"(static_cast(VARIABLE))); \ - else \ - asm("" ::"v"(VARIABLE)) - namespace LIBC_NAMESPACE_DECL { // Returns the overhead associated with calling the profiling region. This @@ -50,8 +43,6 @@ template volatile T storage = t; T arg = storage; - FORCE_TO_REGISTER(T, arg); - // The AMDGPU architecture needs to wait on pending results. gpu::memory_fence(); // Get the current timestamp from the clock. @@ -59,7 +50,6 @@ template // This forces the compiler to load the input argument and run the clock // cycle counter before the profiling region. - FORCE_TO_REGISTER(T, arg); asm("" ::"s"(start)); // Run the function under test and return its value. @@ -67,8 +57,15 @@ template // This inline assembly performs a no-op which forces the result to both // be used and prevents us from exiting this region before it's complete. - asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( - static_cast(result))); + if constexpr (cpp::is_same_v || + cpp::is_same_v) + // AMDGPU does not support input register constraints for i1 and i8, so we + // cast it to a 32-bit integer. This does not add an additional assembly + // instruction (https://godbolt.org/z/zxGqv8G91). + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( + static_cast(result))); + else + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); // Obtain the current timestamp after running the calculation and force // ordering. 
@@ -87,20 +84,19 @@ template T1 arg1 = storage1; T2 arg2 = storage2; - FORCE_TO_REGISTER(T1, arg1); - FORCE_TO_REGISTER(T2, arg2); - gpu::memory_fence(); uint64_t start = gpu::processor_clock(); - FORCE_TO_REGISTER(T1, arg1); - FORCE_TO_REGISTER(T2, arg2); asm("" ::"s"(start)); auto result = f(arg1, arg2); - asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( - static_cast(result))); + if constexpr (cpp::is_same_v || + cpp::is_same_v) + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"( + static_cast(result))); + else + asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); @@ -109,6 +105,31 @@ template return stop - start; } +// Provides throughput benchmarking. +template +[[gnu::noinline]] static LIBC_INLINE uint64_t +throughput(F f, const cpp::array &inputs) { + asm("" ::"v"(&inputs)); + + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + asm("" ::"s"(start)); + + for (auto input : inputs) { + auto result = f(input); + + asm("" ::"v"(result)); + } + + uint64_t stop = gpu::processor_clock(); + asm("" ::"s"(stop)); + gpu::memory_fence(); + + // Return the time elapsed. 
+ return stop - start; +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt index 9958e16206a410a..2723c8940814c6f 100644 --- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -4,4 +4,8 @@ add_header_library( timing.h DEPENDS libc.src.__support.common + libc.src.__support.macros.config + libc.src.__support.macros.attributes + libc.src.__support.CPP.type_traits + libc.src.__support.CPP.array ) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index b426dfd0ea1535f..637986abd9092da 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -9,6 +9,8 @@ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" #include "src/__support/macros/attributes.h" @@ -25,7 +27,7 @@ namespace LIBC_NAMESPACE_DECL { volatile uint32_t x = 1; uint32_t y = x; uint64_t start = gpu::processor_clock(); - asm("" ::"r"(y), "llr"(start)); + asm("" ::"llr"(start)); uint32_t result = y; asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); uint64_t stop = gpu::processor_clock(); @@ -42,7 +44,6 @@ template // not constant propagate it and remove the profiling region. volatile T storage = t; T arg = storage; - asm("" ::"r"(arg)); // Get the current timestamp from the clock. gpu::memory_fence(); @@ -50,7 +51,7 @@ template // This forces the compiler to load the input argument and run the clock cycle // counter before the profiling region. - asm("" ::"r"(arg), "llr"(start)); + asm("" ::"llr"(start)); // Run the function under test and return its value. 
auto result = f(arg); @@ -76,12 +77,11 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { volatile T2 storage2 = t2; T1 arg = storage; T2 arg2 = storage2; - asm("" ::"r"(arg), "r"(arg2)); gpu::memory_fence(); uint64_t start = gpu::processor_clock(); - asm("" ::"r"(arg), "r"(arg2), "llr"(start)); + asm("" ::"llr"(start)); auto result = f(arg, arg2); @@ -94,6 +94,33 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { return stop - start; } + +// Provides throughput benchmarking. +template +[[gnu::noinline]] static LIBC_INLINE uint64_t +throughput(F f, const cpp::array &inputs) { + asm("" ::"r"(&inputs)); + + gpu::memory_fence(); + uint64_t start = gpu::processor_clock(); + + asm("" ::"llr"(start)); + + uint64_t result; + for (auto input : inputs) { + asm("" ::"r"(input)); + result = f(input); + asm("" ::"r"(result)); + } + + uint64_t stop = gpu::processor_clock(); + gpu::memory_fence(); + asm("" ::"r"(stop)); + volatile auto output = result; + + // Return the time elapsed. + return stop - start; +} } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX From 8c3b6bd0cb04a6edc3b294f9048cbfb1ea6c200c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Aug 2024 13:12:59 -0700 Subject: [PATCH 067/112] [flang][cuda] Do not lower device variables in main program as globals (#102512) Flang considers arrays in main program larger than 32 bytes having the SAVE attribute and lowers them as globals. In CUDA Fortran, device variables are not allowed to have the SAVE attribute and should be allocated dynamically in the main program scope. This patch updates lowering so CUDA Fortran device variables are not considered with the SAVE attribute. 
--- flang/include/flang/Evaluate/tools.h | 12 ++++++++++++ flang/lib/Evaluate/tools.cpp | 3 ++- flang/test/Lower/CUDA/cuda-program-global.cuf | 9 ++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 8c6d3b37166a92a..de4d415eda6fd61 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1243,6 +1243,18 @@ bool CheckForCoindexedObject(parser::ContextualMessages &, const std::optional &, const std::string &procName, const std::string &argName); +inline bool CanCUDASymbolHasSave(const Symbol &sym) { + if (const auto *details = + sym.GetUltimate().detailsIf()) { + if (details->cudaDataAttr() && + *details->cudaDataAttr() != common::CUDADataAttr::Pinned && + *details->cudaDataAttr() != common::CUDADataAttr::Unified) { + return false; + } + } + return true; +} + inline bool IsCUDADeviceSymbol(const Symbol &sym) { if (const auto *details = sym.GetUltimate().detailsIf()) { diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 34faba39ffd46f2..4d78f814f8ef2c9 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1696,7 +1696,8 @@ bool IsSaved(const Symbol &original) { (features.IsEnabled(common::LanguageFeature::SaveMainProgram) || (features.IsEnabled( common::LanguageFeature::SaveBigMainProgramVariables) && - symbol.size() > 32))) { + symbol.size() > 32)) && + Fortran::evaluate::CanCUDASymbolHasSave(symbol)) { // With SaveBigMainProgramVariables, keeping all unsaved main program // variables of 32 bytes or less on the stack allows keeping numerical and // logical scalars, small scalar characters or derived, small arrays, and diff --git a/flang/test/Lower/CUDA/cuda-program-global.cuf b/flang/test/Lower/CUDA/cuda-program-global.cuf index 97b9927b3082fd8..a3c9e1ba8d253c1 100644 --- a/flang/test/Lower/CUDA/cuda-program-global.cuf +++ b/flang/test/Lower/CUDA/cuda-program-global.cuf @@ 
-1,19 +1,22 @@ ! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s -! Test lowering of program local variable that are global +! Test lowering of program local variables. Make sure CUDA device variables are +! not lowered as global. program test integer, device :: a(10) + integer, unified :: u(10) integer :: b(10) integer :: i print*,i end ! CHECK-LABEL: func.func @_QQmain() -! CHECK: fir.address_of(@_QFEa) : !fir.ref> +! CHECK: cuf.alloc !fir.array<10xi32> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref> ! CHECK: fir.address_of(@_QFEb) : !fir.ref> ! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} ! CHECK: hlfir.declare %[[ALLOCA]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! CHECK: fir.global internal @_QFEa {data_attr = #cuf.cuda} : !fir.array<10xi32> {{{$}} +! CHECK-NOT: fir.global internal @_QFEa {data_attr = #cuf.cuda} : !fir.array<10xi32> {{{$}} ! CHECK: fir.global internal @_QFEb : !fir.array<10xi32> {{{$}} +! CHECK: fir.global internal @_QFEu {data_attr = #cuf.cuda} : !fir.array<10xi32> From ba976971898d74df38d155c55e008c898120d1e4 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 8 Aug 2024 13:19:26 -0700 Subject: [PATCH 068/112] [NVPTX] support switch statement with brx.idx (#102400) Add custom lowering for `BR_JT` DAG nodes to the `brx.idx` PTX instruction ([PTX ISA 9.7.13.4. Control Flow Instructions: brx.idx] (https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-brx-idx)). 
Depending on the heuristics in DAG selection, `switch` statements may now be lowered using `brx.idx` --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 11 +-- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 45 +++++++++++- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 +++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 38 ++++++++++ llvm/test/CodeGen/NVPTX/jump-table.ll | 69 +++++++++++++++++++ 6 files changed, 169 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/jump-table.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 9ccdbab008aec8b..5b2214fa66c40b4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3843,6 +3843,10 @@ class TargetLowering : public TargetLoweringBase { /// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. virtual unsigned getJumpTableEncoding() const; + virtual MVT getJumpTableRegTy(const DataLayout &DL) const { + return getPointerTy(DL); + } + virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/, const MachineBasicBlock * /*MBB*/, unsigned /*uid*/, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1f4436fb3a49669..37ba62911ec70b2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2977,7 +2977,7 @@ void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) { // Emit the code for the jump table assert(JT.SL && "Should set SDLoc for SelectionDAG!"); assert(JT.Reg != -1U && "Should lower JT Header first!"); - EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); + EVT PTy = DAG.getTargetLoweringInfo().getJumpTableRegTy(DAG.getDataLayout()); SDValue Index = DAG.getCopyFromReg(getControlRoot(), *JT.SL, JT.Reg, PTy); SDValue Table = 
DAG.getJumpTable(JT.JTI, PTy); SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, *JT.SL, MVT::Other, @@ -3005,12 +3005,13 @@ void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT, // This value may be smaller or larger than the target's pointer type, and // therefore require extension or truncating. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout())); + SwitchOp = + DAG.getZExtOrTrunc(Sub, dl, TLI.getJumpTableRegTy(DAG.getDataLayout())); unsigned JumpTableReg = - FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout())); - SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, - JumpTableReg, SwitchOp); + FuncInfo.CreateReg(TLI.getJumpTableRegTy(DAG.getDataLayout())); + SDValue CopyTo = + DAG.getCopyToReg(getControlRoot(), dl, JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; if (!JTH.FallthroughUnreachable) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 516fc7339a4bf38..bf647c88f00e282 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -582,9 +583,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ROTR, MVT::i8, Expand); setOperationAction(ISD::BSWAP, MVT::i16, Expand); - // Indirect branch is not supported. - // This also disables Jump Table creation. 
- setOperationAction(ISD::BR_JT, MVT::Other, Expand); + setOperationAction(ISD::BR_JT, MVT::Other, Custom); setOperationAction(ISD::BRIND, MVT::Other, Expand); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); @@ -945,6 +944,9 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::Dummy) MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) + MAKE_CASE(NVPTXISD::BrxEnd) + MAKE_CASE(NVPTXISD::BrxItem) + MAKE_CASE(NVPTXISD::BrxStart) MAKE_CASE(NVPTXISD::Tex1DFloatS32) MAKE_CASE(NVPTXISD::Tex1DFloatFloat) MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) @@ -2785,6 +2787,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::BR_JT: + return LowerBR_JT(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VASTART: @@ -2810,6 +2814,41 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } +SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Chain = Op.getOperand(0); + const auto *JT = cast(Op.getOperand(1)); + SDValue Index = Op.getOperand(2); + + unsigned JId = JT->getIndex(); + MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); + ArrayRef MBBs = MJTI->getJumpTables()[JId].MBBs; + + SDValue IdV = DAG.getConstant(JId, DL, MVT::i32); + + // Generate BrxStart node + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV); + + // Generate BrxItem nodes + assert(!MBBs.empty()); + for (MachineBasicBlock *MBB : MBBs.drop_back()) + Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0), + DAG.getBasicBlock(MBB), Chain.getValue(1)); + + // Generate BrxEnd nodes + SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index, + IdV, Chain.getValue(1)}; + SDValue BrxEnd = 
DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps); + + return BrxEnd; +} + +// This will prevent AsmPrinter from trying to print the jump tables itself. +unsigned NVPTXTargetLowering::getJumpTableEncoding() const { + return MachineJumpTableInfo::EK_Inline; +} + // This function is almost a copy of SelectionDAG::expandVAArg(). // The only diff is that this one produces loads from local address space. SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 63262961b363eda..32e6b044b0de1f8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -62,6 +62,9 @@ enum NodeType : unsigned { BFI, PRMT, DYNAMIC_STACKALLOC, + BrxStart, + BrxItem, + BrxEnd, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -580,6 +583,11 @@ class NVPTXTargetLowering : public TargetLowering { return true; } + // The default is the same as pointer type, but brx.idx only accepts i32 + MVT getJumpTableRegTy(const DataLayout &) const override { return MVT::i32; } + + unsigned getJumpTableEncoding() const override; + bool enableAggressiveFMAFusion(EVT VT) const override { return true; } // The default is to transform llvm.ctlz(x, false) (where false indicates that @@ -637,6 +645,8 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6a096fa5acea7c1..904574b2e1d6607 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3880,6 +3880,44 @@ def DYNAMIC_STACKALLOC64 : [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>, 
Requires<[hasPTX<73>, hasSM<52>]>; + +// +// BRX +// + +def SDTBrxStartProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; +def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>; + +def brx_start : + SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile, + [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; +def brx_item : + SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile, + [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; +def brx_end : + SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile, + [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>; + +let isTerminator = 1, isBranch = 1, isIndirectBranch = 1 in { + + def BRX_START : + NVPTXInst<(outs), (ins i32imm:$id), + "$$L_brx_$id: .branchtargets", + [(brx_start (i32 imm:$id))]>; + + def BRX_ITEM : + NVPTXInst<(outs), (ins brtarget:$target), + "\t$target,", + [(brx_item bb:$target)]>; + + def BRX_END : + NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id), + "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;", + [(brx_end bb:$target, (i32 Int32Regs:$val), (i32 imm:$id))]>; +} + + include "NVPTXIntrinsics.td" //----------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll new file mode 100644 index 000000000000000..867e171a5840ae5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/jump-table.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s +; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +@out = addrspace(1) global i32 0, align 4 + +define void @foo(i32 %i) { +; CHECK-LABEL: foo( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld.param.u32 %r2, [foo_param_0]; +; CHECK-NEXT: setp.gt.u32 %p1, %r2, 3; +; CHECK-NEXT: @%p1 bra 
$L__BB0_6; +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: $L_brx_0: .branchtargets +; CHECK-NEXT: $L__BB0_2, +; CHECK-NEXT: $L__BB0_3, +; CHECK-NEXT: $L__BB0_4, +; CHECK-NEXT: $L__BB0_5; +; CHECK-NEXT: brx.idx %r2, $L_brx_0; +; CHECK-NEXT: $L__BB0_2: // %case0 +; CHECK-NEXT: mov.b32 %r6, 0; +; CHECK-NEXT: st.global.u32 [out], %r6; +; CHECK-NEXT: bra.uni $L__BB0_6; +; CHECK-NEXT: $L__BB0_4: // %case2 +; CHECK-NEXT: mov.b32 %r4, 2; +; CHECK-NEXT: st.global.u32 [out], %r4; +; CHECK-NEXT: bra.uni $L__BB0_6; +; CHECK-NEXT: $L__BB0_5: // %case3 +; CHECK-NEXT: mov.b32 %r3, 3; +; CHECK-NEXT: st.global.u32 [out], %r3; +; CHECK-NEXT: bra.uni $L__BB0_6; +; CHECK-NEXT: $L__BB0_3: // %case1 +; CHECK-NEXT: mov.b32 %r5, 1; +; CHECK-NEXT: st.global.u32 [out], %r5; +; CHECK-NEXT: $L__BB0_6: // %end +; CHECK-NEXT: ret; +entry: + switch i32 %i, label %end [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + store i32 0, ptr addrspace(1) @out, align 4 + br label %end + +case1: + store i32 1, ptr addrspace(1) @out, align 4 + br label %end + +case2: + store i32 2, ptr addrspace(1) @out, align 4 + br label %end + +case3: + store i32 3, ptr addrspace(1) @out, align 4 + br label %end + +end: + ret void +} From 86cf67ffc1ee62c65bef313bf58ae70f74afb7c1 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:26:45 -0700 Subject: [PATCH 069/112] [libc][newhdrgen] add_function by alphabetical order (#102527) - add_function now adds the function by alphabetical order --- libc/newhdrgen/yaml_to_classes.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libc/newhdrgen/yaml_to_classes.py b/libc/newhdrgen/yaml_to_classes.py index 37a4f78ec4a7b67..3eb5e4ef2546c1a 100644 --- a/libc/newhdrgen/yaml_to_classes.py +++ b/libc/newhdrgen/yaml_to_classes.py @@ -190,7 +190,15 @@ def add_function_to_yaml(yaml_file, function_details): if new_function.attributes: 
function_dict["attributes"] = new_function.attributes - yaml_data["functions"].append(function_dict) + insert_index = 0 + for i, func in enumerate(yaml_data["functions"]): + if func["name"] > new_function.name: + insert_index = i + break + else: + insert_index = len(yaml_data["functions"]) + + yaml_data["functions"].insert(insert_index, function_dict) class IndentYamlListDumper(yaml.Dumper): def increase_indent(self, flow=False, indentless=False): From 2756879000c5cde9e956bb38148ea13366cff439 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Thu, 8 Aug 2024 13:31:10 -0700 Subject: [PATCH 070/112] Revert "[NVPTX] support switch statement with brx.idx" (#102530) Reverts llvm/llvm-project#102400 Causes LLVM to crash on some tests. --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 -- .../SelectionDAG/SelectionDAGBuilder.cpp | 11 ++- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 45 +----------- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 38 ---------- llvm/test/CodeGen/NVPTX/jump-table.ll | 69 ------------------- 6 files changed, 8 insertions(+), 169 deletions(-) delete mode 100644 llvm/test/CodeGen/NVPTX/jump-table.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 5b2214fa66c40b4..9ccdbab008aec8b 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3843,10 +3843,6 @@ class TargetLowering : public TargetLoweringBase { /// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. 
virtual unsigned getJumpTableEncoding() const; - virtual MVT getJumpTableRegTy(const DataLayout &DL) const { - return getPointerTy(DL); - } - virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/, const MachineBasicBlock * /*MBB*/, unsigned /*uid*/, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 37ba62911ec70b2..1f4436fb3a49669 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2977,7 +2977,7 @@ void SelectionDAGBuilder::visitJumpTable(SwitchCG::JumpTable &JT) { // Emit the code for the jump table assert(JT.SL && "Should set SDLoc for SelectionDAG!"); assert(JT.Reg != -1U && "Should lower JT Header first!"); - EVT PTy = DAG.getTargetLoweringInfo().getJumpTableRegTy(DAG.getDataLayout()); + EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue Index = DAG.getCopyFromReg(getControlRoot(), *JT.SL, JT.Reg, PTy); SDValue Table = DAG.getJumpTable(JT.JTI, PTy); SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, *JT.SL, MVT::Other, @@ -3005,13 +3005,12 @@ void SelectionDAGBuilder::visitJumpTableHeader(SwitchCG::JumpTable &JT, // This value may be smaller or larger than the target's pointer type, and // therefore require extension or truncating. 
const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SwitchOp = - DAG.getZExtOrTrunc(Sub, dl, TLI.getJumpTableRegTy(DAG.getDataLayout())); + SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout())); unsigned JumpTableReg = - FuncInfo.CreateReg(TLI.getJumpTableRegTy(DAG.getDataLayout())); - SDValue CopyTo = - DAG.getCopyToReg(getControlRoot(), dl, JumpTableReg, SwitchOp); + FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout())); + SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, + JumpTableReg, SwitchOp); JT.Reg = JumpTableReg; if (!JTH.FallthroughUnreachable) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bf647c88f00e282..516fc7339a4bf38 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -25,7 +25,6 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -583,7 +582,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::ROTR, MVT::i8, Expand); setOperationAction(ISD::BSWAP, MVT::i16, Expand); - setOperationAction(ISD::BR_JT, MVT::Other, Custom); + // Indirect branch is not supported. + // This also disables Jump Table creation. 
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::BRIND, MVT::Other, Expand); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); @@ -944,9 +945,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::Dummy) MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED) MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED) - MAKE_CASE(NVPTXISD::BrxEnd) - MAKE_CASE(NVPTXISD::BrxItem) - MAKE_CASE(NVPTXISD::BrxStart) MAKE_CASE(NVPTXISD::Tex1DFloatS32) MAKE_CASE(NVPTXISD::Tex1DFloatFloat) MAKE_CASE(NVPTXISD::Tex1DFloatFloatLevel) @@ -2787,8 +2785,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::BR_JT: - return LowerBR_JT(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VASTART: @@ -2814,41 +2810,6 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } } -SDValue NVPTXTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { - SDLoc DL(Op); - SDValue Chain = Op.getOperand(0); - const auto *JT = cast(Op.getOperand(1)); - SDValue Index = Op.getOperand(2); - - unsigned JId = JT->getIndex(); - MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo(); - ArrayRef MBBs = MJTI->getJumpTables()[JId].MBBs; - - SDValue IdV = DAG.getConstant(JId, DL, MVT::i32); - - // Generate BrxStart node - SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(NVPTXISD::BrxStart, DL, VTs, Chain, IdV); - - // Generate BrxItem nodes - assert(!MBBs.empty()); - for (MachineBasicBlock *MBB : MBBs.drop_back()) - Chain = DAG.getNode(NVPTXISD::BrxItem, DL, VTs, Chain.getValue(0), - DAG.getBasicBlock(MBB), Chain.getValue(1)); - - // Generate BrxEnd nodes - SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index, - IdV, Chain.getValue(1)}; - SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd, DL, VTs, EndOps); - - return BrxEnd; -} - -// This 
will prevent AsmPrinter from trying to print the jump tables itself. -unsigned NVPTXTargetLowering::getJumpTableEncoding() const { - return MachineJumpTableInfo::EK_Inline; -} - // This function is almost a copy of SelectionDAG::expandVAArg(). // The only diff is that this one produces loads from local address space. SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 32e6b044b0de1f8..63262961b363eda 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -62,9 +62,6 @@ enum NodeType : unsigned { BFI, PRMT, DYNAMIC_STACKALLOC, - BrxStart, - BrxItem, - BrxEnd, Dummy, LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -583,11 +580,6 @@ class NVPTXTargetLowering : public TargetLowering { return true; } - // The default is the same as pointer type, but brx.idx only accepts i32 - MVT getJumpTableRegTy(const DataLayout &) const override { return MVT::i32; } - - unsigned getJumpTableEncoding() const override; - bool enableAggressiveFMAFusion(EVT VT) const override { return true; } // The default is to transform llvm.ctlz(x, false) (where false indicates that @@ -645,8 +637,6 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerSelect(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 904574b2e1d6607..6a096fa5acea7c1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3880,44 +3880,6 @@ def DYNAMIC_STACKALLOC64 : [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>, Requires<[hasPTX<73>, hasSM<52>]>; - -// -// BRX -// - -def SDTBrxStartProfile : 
SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def SDTBrxItemProfile : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>; -def SDTBrxEndProfile : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisInt<1>, SDTCisInt<2>]>; - -def brx_start : - SDNode<"NVPTXISD::BrxStart", SDTBrxStartProfile, - [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>; -def brx_item : - SDNode<"NVPTXISD::BrxItem", SDTBrxItemProfile, - [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>; -def brx_end : - SDNode<"NVPTXISD::BrxEnd", SDTBrxEndProfile, - [SDNPHasChain, SDNPInGlue, SDNPSideEffect]>; - -let isTerminator = 1, isBranch = 1, isIndirectBranch = 1 in { - - def BRX_START : - NVPTXInst<(outs), (ins i32imm:$id), - "$$L_brx_$id: .branchtargets", - [(brx_start (i32 imm:$id))]>; - - def BRX_ITEM : - NVPTXInst<(outs), (ins brtarget:$target), - "\t$target,", - [(brx_item bb:$target)]>; - - def BRX_END : - NVPTXInst<(outs), (ins brtarget:$target, Int32Regs:$val, i32imm:$id), - "\t$target;\n\tbrx.idx \t$val, $$L_brx_$id;", - [(brx_end bb:$target, (i32 Int32Regs:$val), (i32 imm:$id))]>; -} - - include "NVPTXIntrinsics.td" //----------------------------------- diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll deleted file mode 100644 index 867e171a5840ae5..000000000000000 --- a/llvm/test/CodeGen/NVPTX/jump-table.ll +++ /dev/null @@ -1,69 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s | FileCheck %s -; RUN: %if ptxas %{ llc < %s | %ptxas-verify %} - -target triple = "nvptx64-nvidia-cuda" - -@out = addrspace(1) global i32 0, align 4 - -define void @foo(i32 %i) { -; CHECK-LABEL: foo( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b32 %r<7>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u32 %r2, [foo_param_0]; -; CHECK-NEXT: setp.gt.u32 %p1, %r2, 3; -; CHECK-NEXT: @%p1 bra $L__BB0_6; -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: $L_brx_0: 
.branchtargets -; CHECK-NEXT: $L__BB0_2, -; CHECK-NEXT: $L__BB0_3, -; CHECK-NEXT: $L__BB0_4, -; CHECK-NEXT: $L__BB0_5; -; CHECK-NEXT: brx.idx %r2, $L_brx_0; -; CHECK-NEXT: $L__BB0_2: // %case0 -; CHECK-NEXT: mov.b32 %r6, 0; -; CHECK-NEXT: st.global.u32 [out], %r6; -; CHECK-NEXT: bra.uni $L__BB0_6; -; CHECK-NEXT: $L__BB0_4: // %case2 -; CHECK-NEXT: mov.b32 %r4, 2; -; CHECK-NEXT: st.global.u32 [out], %r4; -; CHECK-NEXT: bra.uni $L__BB0_6; -; CHECK-NEXT: $L__BB0_5: // %case3 -; CHECK-NEXT: mov.b32 %r3, 3; -; CHECK-NEXT: st.global.u32 [out], %r3; -; CHECK-NEXT: bra.uni $L__BB0_6; -; CHECK-NEXT: $L__BB0_3: // %case1 -; CHECK-NEXT: mov.b32 %r5, 1; -; CHECK-NEXT: st.global.u32 [out], %r5; -; CHECK-NEXT: $L__BB0_6: // %end -; CHECK-NEXT: ret; -entry: - switch i32 %i, label %end [ - i32 0, label %case0 - i32 1, label %case1 - i32 2, label %case2 - i32 3, label %case3 - ] - -case0: - store i32 0, ptr addrspace(1) @out, align 4 - br label %end - -case1: - store i32 1, ptr addrspace(1) @out, align 4 - br label %end - -case2: - store i32 2, ptr addrspace(1) @out, align 4 - br label %end - -case3: - store i32 3, ptr addrspace(1) @out, align 4 - br label %end - -end: - ret void -} From 1248698e9bb2a0232eee53a72679ed5077190a90 Mon Sep 17 00:00:00 2001 From: jameshu15869 <55058507+jameshu15869@users.noreply.github.com> Date: Thu, 8 Aug 2024 15:32:20 -0500 Subject: [PATCH 071/112] [libc] [gpu] Fix Minor Benchmark UI Issues (#102529) Previously, `AmdgpuSinTwoPow_128` and others were too large for their table cells. This PR shortens the name to `AmdSin...` There were also some `-` missing in the separator. This PR instead creates the separator string using the length of the headers. 
--- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 12 +++++++----- libc/benchmarks/gpu/src/math/sin_benchmark.cpp | 8 ++++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 28235cf137c1867..f237e2ea1b9545e 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -126,12 +126,14 @@ void print_header() { LIBC_NAMESPACE::printf("Running Suite: %-10s\n", benchmarks[0]->get_suite_name().data()); LIBC_NAMESPACE::printf("%s", RESET); - LIBC_NAMESPACE::printf( + cpp::string titles = "Benchmark | Cycles | Min | Max | " - "Iterations | Time / Iteration | Stddev | Threads |\n"); - LIBC_NAMESPACE::printf( - "---------------------------------------------------------------------" - "--------------------------------\n"); + "Iterations | Time / Iteration | Stddev | Threads |\n"; + LIBC_NAMESPACE::printf(titles.data()); + + cpp::string separator(titles.size(), '-'); + separator[titles.size() - 1] = '\n'; + LIBC_NAMESPACE::printf(separator.data()); } void Benchmark::run_benchmarks() { diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index e86961790b9438a..0dc3eae2a24a09c 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -48,8 +48,8 @@ BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(AmdgpuSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); -BENCH(AmdgpuSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); -BENCH(AmdgpuSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); -BENCH(AmdgpuSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +BENCH(AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); +BENCH(AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); +BENCH(AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); +BENCH(AmdSinVeryLarge, 
LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); #endif From bb7143f6669345825c214b26fbe336857f4bf523 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 9 Aug 2024 00:39:12 +0400 Subject: [PATCH 072/112] AMDGPU: Avoid creating unnecessary block split in atomic expansion (#102440) This was creating a new block to insert the is.shared check, but we can just do that in the original block. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 ---- llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll | 4 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 44 +++++++++---------- .../AtomicExpand/AMDGPU/expand-atomic-mmra.ll | 2 - ...and-atomic-rmw-fadd-flat-specialization.ll | 24 ++++------ .../AMDGPU/expand-atomic-rmw-fadd.ll | 6 +-- 6 files changed, 34 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8497aa33449e995..899465d66d957b1 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16609,9 +16609,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { // // With this expansion we produce the following code: // [...] 
- // br label %atomicrmw.check.shared - // - // atomicrmw.check.shared: // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr) // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private // @@ -16654,8 +16651,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Function *F = BB->getParent(); BasicBlock *ExitBB = BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); - BasicBlock *CheckSharedBB = - BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB); BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB); BasicBlock *CheckPrivateBB = BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB); @@ -16682,9 +16677,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - Builder.CreateBr(CheckSharedBB); - - Builder.SetInsertPoint(CheckSharedBB); CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {}, {Addr}, nullptr, "is.shared"); Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB); diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll index 8bf7a1cc42f642f..4f0bc512565d135 100644 --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll @@ -222,7 +222,7 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) #0 { define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-LABEL: syncscope_workgroup_nortn: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -272,7 +272,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: syncscope_workgroup_nortn: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 896b85ea14da11e..422c8a0be23b498 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -630,7 +630,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -682,7 +682,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__am ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -839,7 +839,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -893,7 +893,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gra ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1062,7 +1062,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -1116,7 +1116,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_gra ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -1469,7 +1469,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -1525,7 +1525,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_gr ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: 
flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2006,7 +2006,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2060,7 +2060,7 @@ define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__a ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -2950,7 +2950,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -3002,7 +3002,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memor ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; 
%atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -3159,7 +3159,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3213,7 +3213,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3382,7 +3382,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -3436,7 +3436,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fin ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, 
vcc, 0xfffff800, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc @@ -3789,7 +3789,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -3845,7 +3845,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fi ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -4198,7 +4198,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -4254,7 +4254,7 @@ define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memo ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0 ; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc @@ -5239,7 +5239,7 @@ 
define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX90A: ; %bb.0: ; %atomicrmw.check.shared +; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 @@ -5291,7 +5291,7 @@ define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amd ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode: -; GFX908: ; %bb.0: ; %atomicrmw.check.shared +; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base ; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll index d51e9291a6119ca..78969839efcb8a2 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll @@ -127,8 +127,6 @@ define i16 @test_cmpxchg_i16_global_agent_align4(ptr addrspace(1) %out, i16 %in, define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 { ; GFX90A-LABEL: define void @syncscope_workgroup_nortn( ; GFX90A-SAME: ptr [[ADDR:%.*]], float [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll 
b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index 70dc5b267f73b98..96cbf057b490a88 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -22,8 +22,6 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @syncscope_system( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -36,8 +34,8 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) @@ -94,8 +92,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @syncscope_workgroup_rtn( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -108,8 +104,8 @@ define float 
@syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) @@ -150,8 +146,6 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-LABEL: @syncscope_workgroup_nortn( -; GFX908-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX908: atomicrmw.check.shared: ; GFX908-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX908-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX908: atomicrmw.shared: @@ -164,8 +158,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908: atomicrmw.private: ; GFX908-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX908-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX908-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX908-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX908-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_PHI]] ; GFX908: atomicrmw.global: ; GFX908-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) @@ -178,8 +172,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-NEXT: ret 
void ; ; GFX90A-LABEL: @syncscope_workgroup_nortn( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[ADDR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -192,8 +184,8 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VAL]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index def9522077004f5..7eaaf2ae1ec997a 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -595,8 +595,6 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #0 { ; GFX908-NEXT: ret float [[TMP5]] ; ; GFX90A-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX90A-NEXT: br label [[ATOMICRMW_CHECK_SHARED:%.*]] -; GFX90A: atomicrmw.check.shared: ; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR:%.*]]) ; GFX90A-NEXT: br i1 [[IS_SHARED]], label [[ATOMICRMW_SHARED:%.*]], label [[ATOMICRMW_CHECK_PRIVATE:%.*]] ; GFX90A: atomicrmw.shared: @@ -609,8 +607,8 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float 
%value) #0 { ; GFX90A: atomicrmw.private: ; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) ; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 -; GFX90A-NEXT: [[VAL_NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] -; GFX90A-NEXT: store float [[VAL_NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.global: ; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) From 42b5540211927011856599d960216dae943e1d78 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 9 Aug 2024 00:43:04 +0400 Subject: [PATCH 073/112] AMDGPU: Preserve atomicrmw name when specializing address space (#102470) --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 ++- ...mic-rmw-fadd-flat-specialization-preserve-name.ll | 10 ++++++++++ .../expand-atomic-rmw-fadd-flat-specialization.ll | 12 ++++++------ .../AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll | 4 ++-- 4 files changed, 20 insertions(+), 9 deletions(-) create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 899465d66d957b1..f8767e00949bf0e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16710,12 +16710,13 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const { Builder.CreateBr(PhiBB); Builder.SetInsertPoint(PhiBB); - PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi"); + PHINode *Loaded = Builder.CreatePHI(ValTy, 3); Loaded->addIncoming(LoadedShared, SharedBB); Loaded->addIncoming(LoadedPrivate, PrivateBB); Loaded->addIncoming(LoadedGlobal, GlobalBB); Builder.CreateBr(ExitBB); + Loaded->takeName(AI); 
AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); } diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll new file mode 100644 index 000000000000000..44cd7097059cd2e --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization-preserve-name.ll @@ -0,0 +1,10 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck %s + +; CHECK: %preserve_me = phi float [ %{{[0-9]+}}, %atomicrmw.shared ], [ %loaded.private, %atomicrmw.private ], [ %{{[0-9]+}}, %atomicrmw.global ] +; CHECK: ret float %preserve_me +define float @expand_preserve_name(ptr %addr, float %val) { + %preserve_me = atomicrmw fadd ptr %addr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %preserve_me +} + +!0 = !{} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll index 96cbf057b490a88..fc586a01e3bcf8e 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll @@ -42,10 +42,10 @@ define float @syncscope_system(ptr %addr, float %val) { ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], 
[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @syncscope_system( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]], !amdgpu.ignore.denormal.mode [[META0]] @@ -112,10 +112,10 @@ define float @syncscope_workgroup_rtn(ptr %addr, float %val) { ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @syncscope_workgroup_rtn( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] @@ -166,7 +166,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX908-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX908-NEXT: br label [[ATOMICRMW_PHI]] ; GFX908: atomicrmw.phi: -; GFX908-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ 
[[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX908-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX908-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void @@ -192,7 +192,7 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) { ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 7eaaf2ae1ec997a..0a091bd0fc9ada6 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -615,10 +615,10 @@ define float @test_atomicrmw_fadd_f32_flat_unsafe(ptr %ptr, float %value) #0 { ; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("wavefront") monotonic, align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_PHI]] ; GFX90A: atomicrmw.phi: -; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ 
[[TMP5]], [[ATOMICRMW_GLOBAL]] ] ; GFX90A-NEXT: br label [[ATOMICRMW_END:%.*]] ; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret float [[LOADED_PHI]] +; GFX90A-NEXT: ret float [[RES]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( ; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 From 373d35d1e157996e168bb4fcaef0348bea12e295 Mon Sep 17 00:00:00 2001 From: Joshua Baehring <98630690+JoshuaMBa@users.noreply.github.com> Date: Thu, 8 Aug 2024 13:44:27 -0700 Subject: [PATCH 074/112] [scudo] Added test fixture for cache tests. (#102230) The test fixture simplifies some of the logic for allocations and mmap-based allocations are separated from the cache to allow for more direct cache tests. Additionally, a couple of end to end tests for the cache and the LRU algorithm are added. --- compiler-rt/lib/scudo/standalone/secondary.h | 15 ++- .../scudo/standalone/tests/secondary_test.cpp | 98 +++++++++++++++++++ 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index 34b16df75841b56..27f8697db7838f5 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -178,7 +178,11 @@ template class NonZeroLengthArray { T &operator[](uptr UNUSED Idx) { UNREACHABLE("Unsupported!"); } }; -template class MapAllocatorCache { +// The default unmap callback is simply scudo::unmap. 
+// In testing, a different unmap callback is used to +// record information about unmaps in the cache +template +class MapAllocatorCache { public: void getStats(ScopedString *Str) { ScopedLock L(Mutex); @@ -246,6 +250,7 @@ template class MapAllocatorCache { const s32 Interval = atomic_load_relaxed(&ReleaseToOsIntervalMs); u64 Time; CachedBlock Entry; + Entry.CommitBase = CommitBase; Entry.CommitSize = CommitSize; Entry.BlockBegin = BlockBegin; @@ -290,7 +295,7 @@ template class MapAllocatorCache { // read Options and when we locked Mutex. We can't insert our entry into // the quarantine or the cache because the permissions would be wrong so // just unmap it. - unmap(Entry.MemMap); + unmapCallBack(Entry.MemMap); break; } if (Config::getQuarantineSize() && useMemoryTagging(Options)) { @@ -321,7 +326,7 @@ template class MapAllocatorCache { } while (0); for (MemMapT &EvictMemMap : EvictionMemMaps) - unmap(EvictMemMap); + unmapCallBack(EvictMemMap); if (Interval >= 0) { // TODO: Add ReleaseToOS logic to LRU algorithm @@ -423,7 +428,7 @@ template class MapAllocatorCache { for (u32 I = 0; I != Config::getQuarantineSize(); ++I) { if (Quarantine[I].isValid()) { MemMapT &MemMap = Quarantine[I].MemMap; - unmap(MemMap); + unmapCallBack(MemMap); Quarantine[I].invalidate(); } } @@ -517,7 +522,7 @@ template class MapAllocatorCache { } for (uptr I = 0; I < N; I++) { MemMapT &MemMap = MapInfo[I]; - unmap(MemMap); + unmapCallBack(MemMap); } } diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 5685a9335316d1e..e85b6abdb36d228 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -265,3 +265,101 @@ TEST_F(MapAllocatorWithReleaseTest, SecondaryThreadsRace) { Allocator->getStats(&Str); Str.output(); } + +struct MapAllocatorCacheTest : public Test { + static constexpr scudo::u32 UnmappedMarker = 0xDEADBEEF; + + static 
void testUnmapCallback(scudo::MemMapT &MemMap) { + scudo::u32 *Ptr = reinterpret_cast(MemMap.getBase()); + *Ptr = UnmappedMarker; + } + + using SecondaryConfig = scudo::SecondaryConfig; + using CacheConfig = SecondaryConfig::CacheConfig; + using CacheT = scudo::MapAllocatorCache; + + std::unique_ptr Cache = std::make_unique(); + + const scudo::uptr PageSize = scudo::getPageSizeCached(); + // The current test allocation size is set to the minimum size + // needed for the scudo allocator to fall back to the secondary allocator + static constexpr scudo::uptr TestAllocSize = + CacheConfig::getDefaultMaxEntrySize(); + + scudo::Options Options = getOptionsForConfig(); + + void SetUp() override { Cache->init(/*ReleaseToOsInterval=*/-1); } + + void TearDown() override { Cache->unmapTestOnly(); } + + scudo::MemMapT allocate(scudo::uptr Size) { + scudo::uptr MapSize = scudo::roundUp(Size, PageSize); + scudo::ReservedMemoryT ReservedMemory; + CHECK(ReservedMemory.create(0U, MapSize, nullptr, MAP_ALLOWNOMEM)); + + scudo::MemMapT MemMap = ReservedMemory.dispatch( + ReservedMemory.getBase(), ReservedMemory.getCapacity()); + MemMap.remap(MemMap.getBase(), MemMap.getCapacity(), "scudo:test", + MAP_RESIZABLE | MAP_ALLOWNOMEM); + return MemMap; + } + + void fillCacheWithSameSizeBlocks(std::vector &MemMaps, + scudo::uptr NumEntries, scudo::uptr Size) { + for (scudo::uptr I = 0; I < NumEntries; I++) { + MemMaps.emplace_back(allocate(Size)); + auto &MemMap = MemMaps[I]; + Cache->store(Options, MemMap.getBase(), MemMap.getCapacity(), + MemMap.getBase(), MemMap); + } + } +}; + +TEST_F(MapAllocatorCacheTest, CacheOrder) { + std::vector MemMaps; + Cache->setOption(scudo::Option::MaxCacheEntriesCount, + CacheConfig::getEntriesArraySize()); + + fillCacheWithSameSizeBlocks(MemMaps, CacheConfig::getEntriesArraySize(), + TestAllocSize); + + // Retrieval order should be the inverse of insertion order + for (scudo::uptr I = CacheConfig::getEntriesArraySize(); I > 0; I--) { + scudo::uptr 
EntryHeaderPos; + scudo::CachedBlock Entry = + Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos); + EXPECT_EQ(Entry.MemMap.getBase(), MemMaps[I - 1].getBase()); + } + + // Clean up MemMaps + for (auto &MemMap : MemMaps) + MemMap.unmap(); +} + +TEST_F(MapAllocatorCacheTest, MemoryLeakTest) { + std::vector MemMaps; + // Fill the cache above MaxEntriesCount to force an eviction + // The first cache entry should be evicted (because it is the oldest) + // due to the maximum number of entries being reached + fillCacheWithSameSizeBlocks( + MemMaps, CacheConfig::getDefaultMaxEntriesCount() + 1, TestAllocSize); + + std::vector RetrievedEntries; + + // First MemMap should be evicted from cache because it was the first + // inserted into the cache + for (scudo::uptr I = CacheConfig::getDefaultMaxEntriesCount(); I > 0; I--) { + scudo::uptr EntryHeaderPos; + RetrievedEntries.push_back( + Cache->retrieve(TestAllocSize, PageSize, 0, EntryHeaderPos)); + EXPECT_EQ(MemMaps[I].getBase(), RetrievedEntries.back().MemMap.getBase()); + } + + // Evicted entry should be marked due to unmap callback + EXPECT_EQ(*reinterpret_cast(MemMaps[0].getBase()), + UnmappedMarker); + + // Clean up MemMaps + for (auto &MemMap : MemMaps) + MemMap.unmap(); +} From 1a6d60e0162b3ef767c87c95512dd453bf4f4746 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 8 Aug 2024 16:45:04 -0400 Subject: [PATCH 075/112] [ctx_prof] Fix the pre-thinlink "use" case (#102511) Didn't notice in #101338 that the instrumentation in `llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll` was actually incorrect. 
--- .../Instrumentation/PGOCtxProfLowering.h | 3 ++- llvm/lib/Passes/PassBuilderPipelines.cpp | 13 +++++++------ .../Instrumentation/PGOCtxProfLowering.cpp | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 17 ++++++++++------- .../PGOProfile/ctx-prof-use-prelink.ll | 14 ++++++++++---- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h index 5256aff56205bab..f127d16b8de124d 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h +++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h @@ -19,7 +19,8 @@ class Type; class PGOCtxProfLoweringPass : public PassInfoMixin { public: explicit PGOCtxProfLoweringPass() = default; - static bool isContextualIRPGOEnabled(); + // True if contextual instrumentation is enabled. + static bool isCtxIRPGOInstrEnabled(); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index c175ee898098495..67f3464f745a160 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1173,13 +1173,12 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); // We don't want to mix pgo ctx gen and pgo gen; we also don't currently // enable ctx profiling from the frontend. - assert( - !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) && - "Enabling both instrumented FDO and contextual instrumentation is not " - "supported."); + assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && + "Enabling both instrumented PGO and contextual instrumentation is not " + "supported."); // Enable contextual profiling instrumentation. 
const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; @@ -1670,8 +1669,10 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let // thinlto use the contextual info to perform imports; then use the contextual // profile in the post-thinlink phase. - if (!UseCtxProfile.empty() && !PGOOpt) + if (!UseCtxProfile.empty() && !PGOOpt) { + addRequiredLTOPreLinkPasses(MPM); return MPM; + } // Run partial inlining pass to partially inline functions that have // large bodies. diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index de1d4d2381c06ee..d6ba12465bb3283 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -30,7 +30,7 @@ static cl::list ContextRoots( "root of an interesting graph, which will be profiled independently " "from other similar graphs.")); -bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() { +bool PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() { return !ContextRoots.empty(); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1ce8f58c1aa1408..41618194d12ed7c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -321,6 +321,7 @@ static cl::opt PGOFunctionCriticalEdgeThreshold( " greater than this threshold.")); extern cl::opt MaxNumVTableAnnotations; +extern cl::opt UseCtxProfile; namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. 
@@ -338,9 +339,12 @@ extern cl::opt EnableVTableProfileUse; extern cl::opt ProfileCorrelate; } // namespace llvm +bool shouldInstrumentForCtxProf() { + return PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() || + !UseCtxProfile.empty(); +} bool shouldInstrumentEntryBB() { - return PGOInstrumentEntry || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return PGOInstrumentEntry || shouldInstrumentForCtxProf(); } // FIXME(mtrofin): re-enable this for ctx profiling, for non-indirect calls. Ctx @@ -348,8 +352,7 @@ bool shouldInstrumentEntryBB() { // Supporting other values is relatively straight-forward - just another counter // range within the context. bool isValueProfilingDisabled() { - return DisableValueProfiling || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return DisableValueProfiling || shouldInstrumentForCtxProf(); } // Return a string describing the branch condition that can be @@ -902,7 +905,7 @@ static void instrumentOneFunc( unsigned NumCounters = InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts(); - if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) { + if (shouldInstrumentForCtxProf()) { auto *CSIntrinsic = Intrinsic::getDeclaration(M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This @@ -1861,7 +1864,7 @@ static bool InstrumentAllFunctions( function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. 
- if (!IsCS && !PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) + if (!IsCS && !shouldInstrumentForCtxProf()) createIRLevelProfileFlagVar(M, /*IsCS=*/false); Triple TT(M.getTargetTriple()); @@ -2112,7 +2115,7 @@ static bool annotateAllFunctions( bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled(); if (PGOInstrumentEntry.getNumOccurrences() > 0) InstrumentFuncEntry = PGOInstrumentEntry; - InstrumentFuncEntry |= PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + InstrumentFuncEntry |= shouldInstrumentForCtxProf(); bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage(); for (auto &F : M) { diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index b50a815be5abf5b..18ac2f92aa39d46 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; There is no profile, but that's OK because the prelink does not care about ; the content of the profile, just that we intend to use one. ; There is no scenario currently of doing ctx profile use without thinlto. @@ -7,19 +7,22 @@ declare void @bar() +;. +; CHECK: @__profn_foo = private constant [3 x i8] c"foo" +;. 
define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr { +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] ; CHECK: [[YES]]: ; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FCT]] to i64 -; CHECK-NEXT: call void @llvm.instrprof.value.profile(ptr @__profn_foo, i64 728453322856651412, i64 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) ; CHECK-NEXT: call void [[FCT]](i32 0) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[NO]]: -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -36,3 +39,6 @@ no: exit: ret void } +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. From ae059a1f9f1e501b08a99cb636ec0869ec204c6f Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Thu, 8 Aug 2024 16:52:59 -0400 Subject: [PATCH 076/112] [AMDGPU][True16][CodeGen] support v_mov_b16 and v_swap_b16 in true16 format (#102198) support v_swap_b16 in true16 format. update tableGen pattern and folding for v_mov_b16. 
--------- Co-authored-by: guochen2 --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 10 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 +- .../Target/AMDGPU/SIShrinkInstructions.cpp | 65 +++++++---- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 +- llvm/test/CodeGen/AMDGPU/bf16.ll | 28 ++--- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 16 +-- llvm/test/CodeGen/AMDGPU/v_swap_b16.ll | 110 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx11_asm_err.s | 18 --- llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s | 10 ++ 9 files changed, 192 insertions(+), 73 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/v_swap_b16.ll create mode 100644 llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 32ecf350db59cf9..875738dad74cedf 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1460,7 +1460,15 @@ bool SIFoldOperands::tryFoldFoldableCopy( return false; } - MachineOperand &OpToFold = MI.getOperand(1); + MachineOperand *OpToFoldPtr; + if (MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e64) { + // Folding when any src_modifiers are non-zero is unsupported + if (TII->hasAnyModifiersSet(MI)) + return false; + OpToFoldPtr = &MI.getOperand(2); + } else + OpToFoldPtr = &MI.getOperand(1); + MachineOperand &OpToFold = *OpToFoldPtr; bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal(); // FIXME: We could also be folding things like TargetIndexes. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b6dd4905fb61bb6..8af5c364509f0e6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3369,6 +3369,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { switch (MI.getOpcode()) { + case AMDGPU::V_MOV_B16_t16_e32: + case AMDGPU::V_MOV_B16_t16_e64: case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: @@ -5639,7 +5641,9 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass; const TargetRegisterClass *RC = RI.getRegClass(RCID); unsigned Size = RI.getRegSizeInBits(*RC); - unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32; + unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO + : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64 + : AMDGPU::V_MOV_B32_e32; if (MO.isReg()) Opcode = AMDGPU::COPY; else if (RI.isSGPRClass(RC)) diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 79bcf5e8cd30d4f..155747551471e38 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -657,6 +657,7 @@ void SIShrinkInstructions::dropInstructionKeepingImpDefs( // although requirements match the pass placement and it reduces code size too. 
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MovT.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MovT.getOpcode() == AMDGPU::COPY); Register T = MovT.getOperand(0).getReg(); @@ -668,7 +669,12 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { Register X = Xop.getReg(); unsigned Xsub = Xop.getSubReg(); - unsigned Size = TII->getOpSize(MovT, 0) / 4; + unsigned Size = TII->getOpSize(MovT, 0); + + // We can't match v_swap_b16 pre-RA, because VGPR_16_Lo128 registers + // are not allocatble. + if (Size == 2 && X.isVirtual()) + return nullptr; if (!TRI->isVGPR(*MRI, X)) return nullptr; @@ -684,9 +690,9 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { KilledT = MovY->killsRegister(T, TRI); if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 && + MovY->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && MovY->getOpcode() != AMDGPU::COPY) || - !MovY->getOperand(1).isReg() || - MovY->getOperand(1).getReg() != T || + !MovY->getOperand(1).isReg() || MovY->getOperand(1).getReg() != T || MovY->getOperand(1).getSubReg() != Tsub) continue; @@ -714,6 +720,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { } if (MovX || (I->getOpcode() != AMDGPU::V_MOV_B32_e32 && + I->getOpcode() != AMDGPU::V_MOV_B16_t16_e32 && I->getOpcode() != AMDGPU::COPY) || I->getOperand(0).getReg() != X || I->getOperand(0).getSubReg() != Xsub) { @@ -721,7 +728,7 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { break; } - if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U))) + if (Size > 4 && (I->getNumImplicitOperands() > (I->isCopy() ? 
0U : 1U))) continue; MovX = &*I; @@ -730,23 +737,40 @@ MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const { if (!MovX) continue; - LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY); + LLVM_DEBUG(dbgs() << "Matched v_swap:\n" << MovT << *MovX << *MovY); - for (unsigned I = 0; I < Size; ++I) { - TargetInstrInfo::RegSubRegPair X1, Y1; - X1 = getSubRegForIndex(X, Xsub, I); - Y1 = getSubRegForIndex(Y, Ysub, I); - MachineBasicBlock &MBB = *MovT.getParent(); + MachineBasicBlock &MBB = *MovT.getParent(); + SmallVector Swaps; + if (Size == 2) { auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), - TII->get(AMDGPU::V_SWAP_B32)) - .addDef(X1.Reg, 0, X1.SubReg) - .addDef(Y1.Reg, 0, Y1.SubReg) - .addReg(Y1.Reg, 0, Y1.SubReg) - .addReg(X1.Reg, 0, X1.SubReg).getInstr(); - if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { - // Drop implicit EXEC. - MIB->removeOperand(MIB->getNumExplicitOperands()); - MIB->copyImplicitOps(*MBB.getParent(), *MovX); + TII->get(AMDGPU::V_SWAP_B16)) + .addDef(X) + .addDef(Y) + .addReg(Y) + .addReg(X) + .getInstr(); + Swaps.push_back(MIB); + } else { + assert(Size > 0 && Size % 4 == 0); + for (unsigned I = 0; I < Size / 4; ++I) { + TargetInstrInfo::RegSubRegPair X1, Y1; + X1 = getSubRegForIndex(X, Xsub, I); + Y1 = getSubRegForIndex(Y, Ysub, I); + auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(), + TII->get(AMDGPU::V_SWAP_B32)) + .addDef(X1.Reg, 0, X1.SubReg) + .addDef(Y1.Reg, 0, Y1.SubReg) + .addReg(Y1.Reg, 0, Y1.SubReg) + .addReg(X1.Reg, 0, X1.SubReg) + .getInstr(); + Swaps.push_back(MIB); + } + } + // Drop implicit EXEC. 
+ if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) { + for (MachineInstr *Swap : Swaps) { + Swap->removeOperand(Swap->getNumExplicitOperands()); + Swap->copyImplicitOps(*MBB.getParent(), *MovX); } } MovX->eraseFromParent(); @@ -833,6 +857,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { } if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 || + MI.getOpcode() == AMDGPU::V_MOV_B16_t16_e32 || MI.getOpcode() == AMDGPU::COPY)) { if (auto *NextMI = matchSwap(MI)) { Next = NextMI->getIterator(); @@ -1023,7 +1048,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::NoVRegs)) continue; - if (ST->hasTrue16BitInsts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && + if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(MI.getOpcode()) && !shouldShrinkTrue16(MI)) continue; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 0a2e338b347871c..34d12aa5e078354 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -751,7 +751,7 @@ let SubtargetPredicate = isGFX11Plus in { let IsInvalidSingleUseConsumer = 1; let IsInvalidSingleUseProducer = 1; } - defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16>; + defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>; diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index d732da1a67bc1fb..970bb08e1838b26 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2131,26 +2131,14 @@ define void @test_store_fpimm(ptr addrspace(1) %ptr0, ptr addrspace(1) %ptr1) { ; GFX10-NEXT: global_store_short v[2:3], v5, off ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11TRUE16-LABEL: 
test_store_fpimm: -; GFX11TRUE16: ; %bb.0: -; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, 0x3f80 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.h, 0x4228 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h -; GFX11TRUE16-NEXT: global_store_b16 v[0:1], v5, off -; GFX11TRUE16-NEXT: global_store_b16 v[2:3], v4, off -; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11FAKE16-LABEL: test_store_fpimm: -; GFX11FAKE16: ; %bb.0: -; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11FAKE16-NEXT: v_mov_b32_e32 v4, 0x3f80 -; GFX11FAKE16-NEXT: v_mov_b32_e32 v5, 0x4228 -; GFX11FAKE16-NEXT: global_store_b16 v[0:1], v4, off -; GFX11FAKE16-NEXT: global_store_b16 v[2:3], v5, off -; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: test_store_fpimm: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, 0x3f80 +; GFX11-NEXT: v_mov_b32_e32 v5, 0x4228 +; GFX11-NEXT: global_store_b16 v[0:1], v4, off +; GFX11-NEXT: global_store_b16 v[2:3], v5, off +; GFX11-NEXT: s_setpc_b64 s[30:31] store bfloat 1.0, ptr addrspace(1) %ptr0 store bfloat 42.0, ptr addrspace(1) %ptr1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index 7352fcdd071d5b1..9fe7544003568cf 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -246,9 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; 
GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -264,9 +262,7 @@ define amdgpu_kernel void @fadd_f16_imm_a( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -390,9 +386,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-SDAG-NEXT: s_mov_b32 s3, s7 ; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -408,9 +402,7 @@ define amdgpu_kernel void @fadd_f16_imm_b( ; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h +; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll new file mode 100644 index 000000000000000..1f36f7a0d9616e0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/v_swap_b16.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s + +define half @swap(half %a, half %b, i32 %i) { +; GFX11-TRUE16-LABEL: swap: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX11-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: swap: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX11-FAKE16-NEXT: .LBB0_1: ; %loop +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: 
v_dual_add_nc_u32 v2, -1, v2 +; GFX11-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: swap: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 +; GFX12-TRUE16-NEXT: .LBB0_1: ; %loop +; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v2, -1, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: v_swap_b16 v0.l, v0.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-TRUE16-NEXT: ; %bb.2: ; %ret +; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: swap: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s0, 0 +; GFX12-FAKE16-NEXT: .LBB0_1: ; %loop +; 
GFX12-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_add_nc_u32 v2, -1, v2 +; GFX12-FAKE16-NEXT: v_swap_b32 v1, v0 +; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX12-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX12-FAKE16-NEXT: ; %bb.2: ; %ret +; GFX12-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +entry: + br label %loop + +loop: + %x = phi half [%a, %entry], [%y, %loop] + %y = phi half [%b, %entry], [%x, %loop] + %i2 = phi i32 [%i, %entry], [%i3, %loop] + + %i3 = sub i32 %i2, 1 + + %cmp = icmp eq i32 %i3, 0 + br i1 %cmp, label %ret, label %loop + +ret: + ret half %x +} diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_err.s index 7f99afe01925997..68442b01bf7d909 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_err.s @@ -169,21 +169,3 @@ s_load_b96 s[20:22], s[2:3], s0 s_buffer_load_b96 s[20:22], s[4:7], s0 // GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, s0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, ttmp0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction - -v_mov_b16 v0.l, a0.h -// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s new file mode 100644 index 000000000000000..aa2309dd7d5d7cd --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx11_asm_t16_err.s @@ -0,0 +1,10 @@ +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 %s 2>&1 | FileCheck --check-prefix=GFX11 --implicit-check-not=error: %s + +v_mov_b16 v0.l, s0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, ttmp0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_mov_b16 v0.l, a0.h +// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction From 1baa6f75f34068a0b3fc772457deeed2d44287f8 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Thu, 8 Aug 2024 14:14:03 -0700 Subject: [PATCH 077/112] [llvm][ELF] Add statistics on various section sizes (#102363) Useful with other infrastructure that consume LLVM statistics to get an idea of distribution of section sizes. The breakdown of various section types is subject to change, this is just an initial go at gather some sort of stats. 
Example stats compiling X86ISelLowering.cpp (-g1): ``` "elf-object-writer.AllocROBytes": 308268, "elf-object-writer.AllocRWBytes": 6240, "elf-object-writer.AllocTextBytes": 1659203, "elf-object-writer.DebugBytes": 3180386, "elf-object-writer.OtherBytes": 5862, "elf-object-writer.RelocationBytes": 2623440, "elf-object-writer.StrtabBytes": 228599, "elf-object-writer.SymtabBytes": 120336, "elf-object-writer.UnwindBytes": 85216, ``` --- llvm/lib/MC/ELFObjectWriter.cpp | 56 +++++++++++++++++++++++++- llvm/test/CodeGen/X86/section-stats.ll | 13 ++++++ 2 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/section-stats.ll diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index c40a074137ab2da..7a8a6c8daf95938 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -62,10 +63,23 @@ using namespace llvm; -#undef DEBUG_TYPE -#define DEBUG_TYPE "reloc-info" +#define DEBUG_TYPE "elf-object-writer" namespace { +namespace stats { + +STATISTIC(AllocTextBytes, "Total size of SHF_ALLOC text sections"); +STATISTIC(AllocROBytes, "Total size of SHF_ALLOC readonly sections"); +STATISTIC(AllocRWBytes, "Total size of SHF_ALLOC read-write sections"); +STATISTIC(StrtabBytes, "Total size of SHT_STRTAB sections"); +STATISTIC(SymtabBytes, "Total size of SHT_SYMTAB sections"); +STATISTIC(RelocationBytes, "Total size of relocation sections"); +STATISTIC(DynsymBytes, "Total size of SHT_DYNSYM sections"); +STATISTIC(DebugBytes, "Total size of debug info sections"); +STATISTIC(UnwindBytes, "Total size of unwind sections"); +STATISTIC(OtherBytes, "Total size of uncategorized sections"); + +} // namespace stats struct ELFWriter; @@ -951,6 +965,44 @@ void 
ELFWriter::writeSectionHeader(const MCAssembler &Asm) { else Size = Offsets.second - Offsets.first; + auto SectionHasFlag = [&](uint64_t Flag) -> bool { + return Section->getFlags() & Flag; + }; + + if (Section->getName().starts_with(".debug")) { + stats::DebugBytes += Size; + } else if (Section->getName().starts_with(".eh_frame")) { + stats::UnwindBytes += Size; + } else if (SectionHasFlag(ELF::SHF_ALLOC)) { + if (SectionHasFlag(ELF::SHF_EXECINSTR)) { + stats::AllocTextBytes += Size; + } else if (SectionHasFlag(ELF::SHF_WRITE)) { + stats::AllocRWBytes += Size; + } else { + stats::AllocROBytes += Size; + } + } else { + switch (Section->getType()) { + case ELF::SHT_STRTAB: + stats::StrtabBytes += Size; + break; + case ELF::SHT_SYMTAB: + stats::SymtabBytes += Size; + break; + case ELF::SHT_DYNSYM: + stats::DynsymBytes += Size; + break; + case ELF::SHT_REL: + case ELF::SHT_RELA: + case ELF::SHT_CREL: + stats::RelocationBytes += Size; + break; + default: + stats::OtherBytes += Size; + break; + } + } + writeSection(GroupSymbolIndex, Offsets.first, Size, *Section); } } diff --git a/llvm/test/CodeGen/X86/section-stats.ll b/llvm/test/CodeGen/X86/section-stats.ll new file mode 100644 index 000000000000000..94d0a965ac59ee4 --- /dev/null +++ b/llvm/test/CodeGen/X86/section-stats.ll @@ -0,0 +1,13 @@ +; REQUIRES: asserts +; RUN: llc -o /dev/null -filetype=obj -stats %s 2>&1 | FileCheck %s + +; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC text sections +; CHECK-DAG: 1 elf-object-writer - Total size of SHF_ALLOC read-write sections + +target triple = "x86_64-unknown-linux-gnu" + +@g = global i8 1 + +define void @f() { + ret void +} From 967185eeb85abb77bd6b6cdd2b026d5c54b7d4f3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 8 Aug 2024 21:14:15 +0000 Subject: [PATCH 078/112] Revert "[ctx_prof] Fix the pre-thinlink "use" case (#102511)" This reverts commit 1a6d60e0162b3ef767c87c95512dd453bf4f4746. Broke some buildbots. 
--- .../Instrumentation/PGOCtxProfLowering.h | 3 +-- llvm/lib/Passes/PassBuilderPipelines.cpp | 13 ++++++------- .../Instrumentation/PGOCtxProfLowering.cpp | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 17 +++++++---------- .../PGOProfile/ctx-prof-use-prelink.ll | 14 ++++---------- 5 files changed, 19 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h index f127d16b8de124d..5256aff56205bab 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h +++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h @@ -19,8 +19,7 @@ class Type; class PGOCtxProfLoweringPass : public PassInfoMixin { public: explicit PGOCtxProfLoweringPass() = default; - // True if contextual instrumentation is enabled. - static bool isCtxIRPGOInstrEnabled(); + static bool isContextualIRPGOEnabled(); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 67f3464f745a160..c175ee898098495 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1173,12 +1173,13 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); // We don't want to mix pgo ctx gen and pgo gen; we also don't currently // enable ctx profiling from the frontend. - assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && - "Enabling both instrumented PGO and contextual instrumentation is not " - "supported."); + assert( + !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) && + "Enabling both instrumented FDO and contextual instrumentation is not " + "supported."); // Enable contextual profiling instrumentation. 
const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && - PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); + PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; @@ -1669,10 +1670,8 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let // thinlto use the contextual info to perform imports; then use the contextual // profile in the post-thinlink phase. - if (!UseCtxProfile.empty() && !PGOOpt) { - addRequiredLTOPreLinkPasses(MPM); + if (!UseCtxProfile.empty() && !PGOOpt) return MPM; - } // Run partial inlining pass to partially inline functions that have // large bodies. diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index d6ba12465bb3283..de1d4d2381c06ee 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -30,7 +30,7 @@ static cl::list ContextRoots( "root of an interesting graph, which will be profiled independently " "from other similar graphs.")); -bool PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() { +bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() { return !ContextRoots.empty(); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 41618194d12ed7c..1ce8f58c1aa1408 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -321,7 +321,6 @@ static cl::opt PGOFunctionCriticalEdgeThreshold( " greater than this threshold.")); extern cl::opt MaxNumVTableAnnotations; -extern cl::opt UseCtxProfile; namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. 
@@ -339,12 +338,9 @@ extern cl::opt EnableVTableProfileUse; extern cl::opt ProfileCorrelate; } // namespace llvm -bool shouldInstrumentForCtxProf() { - return PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() || - !UseCtxProfile.empty(); -} bool shouldInstrumentEntryBB() { - return PGOInstrumentEntry || shouldInstrumentForCtxProf(); + return PGOInstrumentEntry || + PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); } // FIXME(mtrofin): re-enable this for ctx profiling, for non-indirect calls. Ctx @@ -352,7 +348,8 @@ bool shouldInstrumentEntryBB() { // Supporting other values is relatively straight-forward - just another counter // range within the context. bool isValueProfilingDisabled() { - return DisableValueProfiling || shouldInstrumentForCtxProf(); + return DisableValueProfiling || + PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); } // Return a string describing the branch condition that can be @@ -905,7 +902,7 @@ static void instrumentOneFunc( unsigned NumCounters = InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts(); - if (shouldInstrumentForCtxProf()) { + if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) { auto *CSIntrinsic = Intrinsic::getDeclaration(M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This @@ -1864,7 +1861,7 @@ static bool InstrumentAllFunctions( function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. 
- if (!IsCS && !shouldInstrumentForCtxProf()) + if (!IsCS && !PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) createIRLevelProfileFlagVar(M, /*IsCS=*/false); Triple TT(M.getTargetTriple()); @@ -2115,7 +2112,7 @@ static bool annotateAllFunctions( bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled(); if (PGOInstrumentEntry.getNumOccurrences() > 0) InstrumentFuncEntry = PGOInstrumentEntry; - InstrumentFuncEntry |= shouldInstrumentForCtxProf(); + InstrumentFuncEntry |= PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage(); for (auto &F : M) { diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index 18ac2f92aa39d46..b50a815be5abf5b 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; There is no profile, but that's OK because the prelink does not care about ; the content of the profile, just that we intend to use one. ; There is no scenario currently of doing ctx profile use without thinlto. @@ -7,22 +7,19 @@ declare void @bar() -;. -; CHECK: @__profn_foo = private constant [3 x i8] c"foo" -;. 
define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr { -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] ; CHECK: [[YES]]: ; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FCT]] to i64 +; CHECK-NEXT: call void @llvm.instrprof.value.profile(ptr @__profn_foo, i64 728453322856651412, i64 [[TMP1]], i32 0, i32 0) ; CHECK-NEXT: call void [[FCT]](i32 0) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[NO]]: -; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -39,6 +36,3 @@ no: exit: ret void } -;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } -;. From d46c26b8102dee763d72bf98341bc95b21767196 Mon Sep 17 00:00:00 2001 From: Kirill Stoimenov <87100199+kstoimenov@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:19:56 -0700 Subject: [PATCH 079/112] Fix prctl to handle PR_GET_PDEATHSIG. 
(#101749) --- .../sanitizer_common/sanitizer_common_interceptors.inc | 7 +++++-- .../test/sanitizer_common/TestCases/Linux/prctl.cpp | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 49c9dcbef358ff2..0fa491a9641a6c1 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1255,6 +1255,7 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, static const int PR_SET_VMA = 0x53564d41; static const int PR_SCHED_CORE = 62; static const int PR_SCHED_CORE_GET = 0; + static const int PR_GET_PDEATHSIG = 2; if (option == PR_SET_VMA && arg2 == 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); @@ -1270,7 +1271,9 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, COMMON_INTERCEPTOR_WRITE_RANGE(ctx, name, internal_strlen(name) + 1); } else if (res != -1 && option == PR_SCHED_CORE && arg2 == PR_SCHED_CORE_GET) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64*)(arg5), sizeof(u64)); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg5), sizeof(u64)); + } else if (res != -1 && option == PR_GET_PDEATHSIG) { + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg2), sizeof(u64)); } return res; } @@ -9980,7 +9983,7 @@ INTERCEPTOR(SSIZE_T, getrandom, void *buf, SIZE_T buflen, unsigned int flags) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, getrandom, buf, buflen, flags); // If GRND_NONBLOCK is set in the flags, it is non blocking. 
- static const int grnd_nonblock = 1; + static const int grnd_nonblock = 1; SSIZE_T n; if ((flags & grnd_nonblock)) n = REAL(getrandom)(buf, buflen, flags); diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index d5d81280e0b44c0..3d1c24cddc64370 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -41,6 +41,14 @@ int main() { assert(cookie != 0); } + int signum; + res = prctl(PR_GET_PDEATHSIG, reinterpret_cast(&signum)); + if (res < 0) { + assert(errno == EINVAL); + } else { + assert(signum == 0); + } + char invname[81], vlname[] = "prctl"; for (auto i = 0; i < sizeof(invname); i++) { invname[i] = 0x1e; From 2b592b16c17e5c4fc135534d80d7c61a986a292c Mon Sep 17 00:00:00 2001 From: jameshu15869 <55058507+jameshu15869@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:26:26 -0500 Subject: [PATCH 080/112] [libc][gpu] Add Sinf Benchmarks (#102532) This PR adds benchmarking for `sinf()` using the same set up as `sin()` but with a smaller range for floats. 
--- libc/benchmarks/gpu/src/math/CMakeLists.txt | 1 + .../benchmarks/gpu/src/math/sin_benchmark.cpp | 60 ++++++++++++------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt index 77250edb6bb526f..335da5ad71cf882 100644 --- a/libc/benchmarks/gpu/src/math/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt @@ -32,6 +32,7 @@ add_benchmark( sin_benchmark.cpp DEPENDS libc.src.math.sin + libc.src.math.sinf libc.src.stdlib.srand libc.src.stdlib.rand libc.src.__support.FPUtil.fp_bits diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index 0dc3eae2a24a09c..bf09e6e462172e8 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -5,6 +5,7 @@ #include "src/__support/CPP/functional.h" #include "src/__support/FPUtil/FPBits.h" #include "src/math/sin.h" +#include "src/math/sinf.h" #include "src/stdlib/rand.h" #ifdef NVPTX_MATH_FOUND @@ -19,37 +20,56 @@ // uint64_t representing the latency. Defining each benchmark using macro that // expands to a lambda to allow us to switch the implementation of `sin()` to // easily register NVPTX benchmarks. 
-#define BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, N) \ +#define BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ []() { \ - return LIBC_NAMESPACE::benchmarks::MathPerf< \ - double>::run_throughput_in_range(Func, MIN_EXP, MAX_EXP); \ + return LIBC_NAMESPACE::benchmarks::MathPerf::run_throughput_in_range< \ + N>(Func, MIN_EXP, MAX_EXP); \ } -#define BENCH(Name, Func, MIN_EXP, MAX_EXP) \ +#define BENCH(T, Name, Func, MIN_EXP, MAX_EXP) \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1, \ - BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1)); \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_128, \ - BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 128)); \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 128)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_1024, \ - BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 1024)); \ + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 1024)); \ SINGLE_WAVE_BENCHMARK(LlvmLibcSinGpuBenchmark, Name##_4096, \ - BM_RANDOM_INPUT(Func, MIN_EXP, MAX_EXP, 4096)) + BM_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, 4096)) -BENCH(Sin, LIBC_NAMESPACE::sin, -1023, 1023); -BENCH(SinTwoPi, LIBC_NAMESPACE::sin, -10, 3); -BENCH(SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); -BENCH(SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); +BENCH(double, Sin, LIBC_NAMESPACE::sin, -1023, 1023); +BENCH(double, SinTwoPi, LIBC_NAMESPACE::sin, -10, 3); +BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); +BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCH(NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); -BENCH(NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); -BENCH(NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); -BENCH(NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); +BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); +BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); +BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); +BENCH(double, NvSinVeryLarge, 
LIBC_NAMESPACE::__nv_sin, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); -BENCH(AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); -BENCH(AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); -BENCH(AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); +BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); +BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); +BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +#endif + +BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128); +BENCH(float, SinfTwoPi, LIBC_NAMESPACE::sinf, -10, 3); +BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30); +BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120); + +#ifdef NVPTX_MATH_FOUND +BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128); +BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3); +BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30); +BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120); +#endif + +#ifdef AMDGPU_MATH_FOUND +BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128); +BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3); +BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30); +BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120); #endif From d0fe470fd2b32de96762465fac130c13e9a1f782 Mon Sep 17 00:00:00 2001 From: aaryanshukla <53713108+aaryanshukla@users.noreply.github.com> Date: Thu, 8 Aug 2024 14:33:50 -0700 Subject: [PATCH 081/112] [libc][math] Add scalbln{,f,l,f128} math functions (#102219) Co-authored-by: OverMighty --- libc/config/baremetal/arm/entrypoints.txt | 3 + libc/config/baremetal/riscv/entrypoints.txt | 3 + libc/config/darwin/arm/entrypoints.txt | 3 + libc/config/gpu/entrypoints.txt | 2 + libc/config/linux/aarch64/entrypoints.txt | 4 ++ 
libc/config/linux/arm/entrypoints.txt | 3 + libc/config/linux/riscv/entrypoints.txt | 4 ++ libc/config/linux/x86_64/entrypoints.txt | 4 ++ libc/config/windows/entrypoints.txt | 3 + libc/docs/math/index.rst | 2 +- libc/newhdrgen/yaml/math.yaml | 37 ++++++++++ libc/spec/stdc.td | 4 ++ libc/src/math/CMakeLists.txt | 4 ++ libc/src/math/generic/CMakeLists.txt | 54 ++++++++++++++ libc/src/math/generic/scalbln.cpp | 25 +++++++ libc/src/math/generic/scalblnf.cpp | 25 +++++++ libc/src/math/generic/scalblnf128.cpp | 25 +++++++ libc/src/math/generic/scalblnl.cpp | 25 +++++++ libc/src/math/scalbln.h | 20 ++++++ libc/src/math/scalblnf.h | 20 ++++++ libc/src/math/scalblnf128.h | 21 ++++++ libc/src/math/scalblnl.h | 20 ++++++ libc/test/src/math/smoke/CMakeLists.txt | 72 +++++++++++++++++++ libc/test/src/math/smoke/scalbln_test.cpp | 13 ++++ libc/test/src/math/smoke/scalblnf128_test.cpp | 13 ++++ libc/test/src/math/smoke/scalblnf_test.cpp | 13 ++++ libc/test/src/math/smoke/scalblnl_test.cpp | 13 ++++ 27 files changed, 434 insertions(+), 1 deletion(-) create mode 100644 libc/src/math/generic/scalbln.cpp create mode 100644 libc/src/math/generic/scalblnf.cpp create mode 100644 libc/src/math/generic/scalblnf128.cpp create mode 100644 libc/src/math/generic/scalblnl.cpp create mode 100644 libc/src/math/scalbln.h create mode 100644 libc/src/math/scalblnf.h create mode 100644 libc/src/math/scalblnf128.h create mode 100644 libc/src/math/scalblnl.h create mode 100644 libc/test/src/math/smoke/scalbln_test.cpp create mode 100644 libc/test/src/math/smoke/scalblnf128_test.cpp create mode 100644 libc/test/src/math/smoke/scalblnf_test.cpp create mode 100644 libc/test/src/math/smoke/scalblnl_test.cpp diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index d9b0fd8d065862e..af9a8bc9925441f 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -394,6 +394,9 @@ set(TARGET_LIBM_ENTRYPOINTS 
libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 60d3070c963a050..6ebe2e4a29025f3 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -389,6 +389,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index 78e46b18993a096..36da9e13136638f 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -248,6 +248,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 3e0e6aff5524565..5e05c1617a3be01 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -336,6 +336,8 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.rintf libc.src.math.round libc.src.math.roundf + libc.src.math.scalbln + libc.src.math.scalblnf libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.sin diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 034db23932a3987..bb0ebca29e6ae25 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -542,6 +542,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + 
libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl @@ -729,6 +732,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 9d3ab0c157aa81c..7fd60799bbcc722 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -371,6 +371,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index e2d8e9e32496778..0d48b55a9654836 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -545,6 +545,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl @@ -636,6 +639,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 817f4190399aa19..b9134c8496c30ae 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -545,6 +545,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.roundevenl libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn 
libc.src.math.scalbnf libc.src.math.scalbnl @@ -725,6 +728,7 @@ if(LIBC_TYPES_HAS_FLOAT128) libc.src.math.rintf128 libc.src.math.roundevenf128 libc.src.math.roundf128 + libc.src.math.scalblnf128 libc.src.math.scalbnf128 libc.src.math.setpayloadf128 libc.src.math.setpayloadsigf128 diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 1f36bdf9b21ee8d..7fa7eb22772e29b 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -264,6 +264,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.round libc.src.math.roundf libc.src.math.roundl + libc.src.math.scalbln + libc.src.math.scalblnf + libc.src.math.scalblnl libc.src.math.scalbn libc.src.math.scalbnf libc.src.math.scalbnl diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index a3174c72e86ce12..9185a95192963e2 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -220,7 +220,7 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | roundeven | |check| | |check| | |check| | |check| | |check| | 7.12.9.8 | F.10.6.8 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| scalbln | | | | |check| | | 7.12.6.19 | F.10.3.19 | +| scalbln | |check| | |check| | |check| | |check| | |check| | 7.12.6.19 | F.10.3.19 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | scalbn | |check| | |check| | |check| | |check| | |check| | 7.12.6.19 | F.10.3.19 | 
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index 1b474b3401ecb5a..2f1203e581f9a72 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2313,6 +2313,43 @@ functions: - type: int - type: unsigned int guard: LIBC_TYPES_HAS_FLOAT128 + - name: scalbln + standards: + - stdc + return_type: double + arguments: + - type: double + - type: long + - name: scalblnl + standards: + - stdc + return_type: long double + arguments: + - type: long double + - type: long + - name: scalblnf + standards: + - stdc + return_type: float + arguments: + - type: float + - type: long + - name: scalblnf16 + standards: + - stdc + return_type: float16 + arguments: + - type: float16 + - type: long + guard: LIBC_TYPES_HAS_FLOAT16 + - name: scalblnf128 + standards: + - stdc + return_type: float128 + arguments: + - type: float128 + - type: long + guard: LIBC_TYPES_HAS_FLOAT128 - name: lgamma standards: - stdc diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 082b1e499e0d02b..72bfe0cf71aa2bd 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -717,7 +717,11 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"asinhf", RetValSpec, [ArgSpec]>, FunctionSpec<"atanhf", RetValSpec, [ArgSpec]>, + FunctionSpec<"scalbln", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"scalblnf", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"scalblnl", RetValSpec, [ArgSpec, ArgSpec]>, GuardedFunctionSpec<"scalblnf16", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT16">, + GuardedFunctionSpec<"scalblnf128", RetValSpec, [ArgSpec, ArgSpec], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"scalbn", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"scalbnf", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 
a07065c93003f26..e2ebf2ddd4bfa44 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -427,7 +427,11 @@ add_math_entrypoint_object(roundevenl) add_math_entrypoint_object(roundevenf16) add_math_entrypoint_object(roundevenf128) +add_math_entrypoint_object(scalbln) +add_math_entrypoint_object(scalblnf) +add_math_entrypoint_object(scalblnl) add_math_entrypoint_object(scalblnf16) +add_math_entrypoint_object(scalblnf128) add_math_entrypoint_object(scalbn) add_math_entrypoint_object(scalbnf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index a224bed8c3a14c4..c80c7ca7f7af11d 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4185,6 +4185,46 @@ add_entrypoint_object( libc.src.__support.macros.optimization ) + +add_entrypoint_object( + scalbln + SRCS + scalbln.cpp + HDRS + ../scalbln.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + scalblnf + SRCS + scalblnf.cpp + HDRS + ../scalblnf.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( + scalblnl + SRCS + scalblnl.cpp + HDRS + ../scalblnl.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( scalblnf16 SRCS @@ -4199,6 +4239,20 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + scalblnf128 + SRCS + scalblnf128.cpp + HDRS + ../scalblnf128.h + DEPENDS + libc.hdr.float_macros + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( scalbn SRCS diff --git a/libc/src/math/generic/scalbln.cpp b/libc/src/math/generic/scalbln.cpp new file mode 100644 index 000000000000000..f97619954237eef --- /dev/null +++ b/libc/src/math/generic/scalbln.cpp @@ -0,0 +1,25 @@ 
+//===-- Implementation of scalbln function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalbln.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, scalbln, (double x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnf.cpp b/libc/src/math/generic/scalblnf.cpp new file mode 100644 index 000000000000000..aa11a552a919401 --- /dev/null +++ b/libc/src/math/generic/scalblnf.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnf function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnf.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." 
+#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float, scalblnf, (float x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnf128.cpp b/libc/src/math/generic/scalblnf128.cpp new file mode 100644 index 000000000000000..fda6ea0bfe03056 --- /dev/null +++ b/libc/src/math/generic/scalblnf128.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnf128 function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnf128.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float128, scalblnf128, (float128 x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/scalblnl.cpp b/libc/src/math/generic/scalblnl.cpp new file mode 100644 index 000000000000000..5823c498ba3ecc4 --- /dev/null +++ b/libc/src/math/generic/scalblnl.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of scalblnl function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/scalblnl.h" +#include "hdr/float_macros.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#if FLT_RADIX != 2 +#error "FLT_RADIX != 2 is not supported." +#endif + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long double, scalblnl, (long double x, long n)) { + return fputil::ldexp(x, n); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/scalbln.h b/libc/src/math/scalbln.h new file mode 100644 index 000000000000000..b99ba7683ebc4a3 --- /dev/null +++ b/libc/src/math/scalbln.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalbln -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLN_H +#define LLVM_LIBC_SRC_MATH_SCALBLN_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double scalbln(double x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLN_H diff --git a/libc/src/math/scalblnf.h b/libc/src/math/scalblnf.h new file mode 100644 index 000000000000000..a757f528e785641 --- /dev/null +++ b/libc/src/math/scalblnf.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalblnf ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNF_H +#define LLVM_LIBC_SRC_MATH_SCALBLNF_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +float scalblnf(float x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNF_H diff --git a/libc/src/math/scalblnf128.h b/libc/src/math/scalblnf128.h new file mode 100644 index 000000000000000..b9b7a862f66f681 --- /dev/null +++ b/libc/src/math/scalblnf128.h @@ -0,0 +1,21 @@ +//===-- Implementation header for scalblnf128 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNF128_H +#define LLVM_LIBC_SRC_MATH_SCALBLNF128_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float128 scalblnf128(float128 x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNF128_H diff --git a/libc/src/math/scalblnl.h b/libc/src/math/scalblnl.h new file mode 100644 index 000000000000000..e2df840892e5e97 --- /dev/null +++ b/libc/src/math/scalblnl.h @@ -0,0 +1,20 @@ +//===-- Implementation header for scalblnl ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SCALBLNL_H +#define LLVM_LIBC_SRC_MATH_SCALBLNL_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long double scalblnl(long double x, long n); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SCALBLNL_H diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index ef303228a9fe007..609ecef42f74596 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3630,6 +3630,51 @@ add_fp_unittest( libc.src.math.atan2 ) +add_fp_unittest( + scalbln_test + SUITE + libc-math-smoke-tests + SRCS + scalbln_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalbln + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnf_test + SUITE + libc-math-smoke-tests + SRCS + scalblnf_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnf + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnl_test + SUITE + libc-math-smoke-tests + SRCS + scalblnl_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnl + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + add_fp_unittest( scalblnf16_test SUITE @@ -3640,7 +3685,24 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalblnf16 + libc.src.__support.CPP.limits + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float +) + +add_fp_unittest( + scalblnf128_test + SUITE + libc-math-smoke-tests + SRCS + scalblnf128_test.cpp + HDRS + ScalbnTest.h + DEPENDS + libc.src.math.scalblnf128 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + 
libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3653,7 +3715,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbn + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3666,7 +3730,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3679,7 +3745,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnl + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3692,7 +3760,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf16 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( @@ -3705,7 +3775,9 @@ add_fp_unittest( ScalbnTest.h DEPENDS libc.src.math.scalbnf128 + libc.src.__support.CPP.limits libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.normal_float ) add_fp_unittest( diff --git a/libc/test/src/math/smoke/scalbln_test.cpp b/libc/test/src/math/smoke/scalbln_test.cpp new file mode 100644 index 000000000000000..eaf7b8e47b41ce3 --- /dev/null +++ b/libc/test/src/math/smoke/scalbln_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalbln ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalbln.h" + +LIST_SCALBN_TESTS(double, long, LIBC_NAMESPACE::scalbln) diff --git a/libc/test/src/math/smoke/scalblnf128_test.cpp b/libc/test/src/math/smoke/scalblnf128_test.cpp new file mode 100644 index 000000000000000..9ee1b3816511e0d --- /dev/null +++ b/libc/test/src/math/smoke/scalblnf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnf128 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnf128.h" + +LIST_SCALBN_TESTS(float128, long, LIBC_NAMESPACE::scalblnf128) diff --git a/libc/test/src/math/smoke/scalblnf_test.cpp b/libc/test/src/math/smoke/scalblnf_test.cpp new file mode 100644 index 000000000000000..a40d7aa7886db1f --- /dev/null +++ b/libc/test/src/math/smoke/scalblnf_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnf --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnf.h" + +LIST_SCALBN_TESTS(float, long, LIBC_NAMESPACE::scalblnf) diff --git a/libc/test/src/math/smoke/scalblnl_test.cpp b/libc/test/src/math/smoke/scalblnl_test.cpp new file mode 100644 index 000000000000000..ccfbe1ebdeaf094 --- /dev/null +++ b/libc/test/src/math/smoke/scalblnl_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for scalblnl --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ScalbnTest.h" + +#include "src/math/scalblnl.h" + +LIST_SCALBN_TESTS(long double, long, LIBC_NAMESPACE::scalblnl) From d851b5c12c6c9758fec8784df730f330b6b31644 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Thu, 8 Aug 2024 14:39:07 -0700 Subject: [PATCH 082/112] [libc] Make str_to_float independent of fenv (#102369) The str_to_float conversion code doesn't need the features provided by fenv and the dependency is creating a blocker for hand-in-hand. This patch uses a workaround to remove this dependency. 
--- libc/src/__support/CMakeLists.txt | 2 -- libc/src/__support/str_to_float.h | 11 +++++------ utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 2 -- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index d8a192f1ffa570c..9bd1e29081a801f 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -190,8 +190,6 @@ add_header_library( libc.src.__support.CPP.bit libc.src.__support.CPP.limits libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode libc.src.errno.errno diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index c72bc1f957dc374..17cf3dd55b9fb4b 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -13,9 +13,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/dyadic_float.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" #include "src/__support/ctype_utils.h" @@ -27,6 +25,8 @@ #include "src/__support/uint128.h" #include "src/errno/libc_errno.h" // For ERANGE +#include + namespace LIBC_NAMESPACE_DECL { namespace internal { @@ -525,10 +525,9 @@ clinger_fast_path(ExpandedFloat init_num, FPBits result; T float_mantissa; if constexpr (cpp::is_same_v>) { - float_mantissa = static_cast(fputil::DyadicFloat<128>( - Sign::POS, 0, - fputil::DyadicFloat<128>::MantissaType( - {uint64_t(mantissa), uint64_t(mantissa >> 64)}))); + float_mantissa = + (static_cast(uint64_t(mantissa)) * static_cast(0x1.0p64)) + + static_cast(uint64_t(mantissa >> 64)); } else { float_mantissa = static_cast(mantissa); } diff --git 
a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index e81e677dc58dd21..bedd363edd1bdee 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -664,8 +664,6 @@ libc_support_library( ":__support_cpp_optional", ":__support_cpp_string_view", ":__support_ctype_utils", - ":__support_fputil_dyadic_float", - ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", ":__support_fputil_rounding_mode", ":__support_str_to_integer", From 070ce816dadb266f3296256048e6a5fb8517b06d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Aug 2024 15:13:46 -0700 Subject: [PATCH 083/112] [RISCV] Remove unused function argument in RISCVOptWInstrs. NFC --- llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 49be866448f2ea1..00ac8cb9a9066d7 100644 --- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -350,8 +350,7 @@ static bool hasAllWUsers(const MachineInstr &OrigMI, const RISCVSubtarget &ST, // This function returns true if the machine instruction always outputs a value // where bits 63:32 match bit 31. -static bool isSignExtendingOpW(const MachineInstr &MI, - const MachineRegisterInfo &MRI, unsigned OpNo) { +static bool isSignExtendingOpW(const MachineInstr &MI, unsigned OpNo) { uint64_t TSFlags = MI.getDesc().TSFlags; // Instructions that can be determined from opcode are marked in tablegen. @@ -426,7 +425,7 @@ static bool isSignExtendedW(Register SrcReg, const RISCVSubtarget &ST, assert(OpNo != -1 && "Couldn't find register"); // If this is a sign extending operation we don't need to look any further. - if (isSignExtendingOpW(*MI, MRI, OpNo)) + if (isSignExtendingOpW(*MI, OpNo)) continue; // Is this an instruction that propagates sign extend? 
From 046524e8fe425cbc86c3371f2bf29fbb39d98435 Mon Sep 17 00:00:00 2001 From: Kirill Stoimenov Date: Thu, 8 Aug 2024 22:41:53 +0000 Subject: [PATCH 084/112] Revert "Fix prctl to handle PR_GET_PDEATHSIG. (#101749)" Broke the build. This reverts commit d46c26b8102dee763d72bf98341bc95b21767196. --- .../sanitizer_common/sanitizer_common_interceptors.inc | 7 ++----- .../test/sanitizer_common/TestCases/Linux/prctl.cpp | 8 -------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 0fa491a9641a6c1..49c9dcbef358ff2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1255,7 +1255,6 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, static const int PR_SET_VMA = 0x53564d41; static const int PR_SCHED_CORE = 62; static const int PR_SCHED_CORE_GET = 0; - static const int PR_GET_PDEATHSIG = 2; if (option == PR_SET_VMA && arg2 == 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); @@ -1271,9 +1270,7 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, COMMON_INTERCEPTOR_WRITE_RANGE(ctx, name, internal_strlen(name) + 1); } else if (res != -1 && option == PR_SCHED_CORE && arg2 == PR_SCHED_CORE_GET) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg5), sizeof(u64)); - } else if (res != -1 && option == PR_GET_PDEATHSIG) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg2), sizeof(u64)); + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64*)(arg5), sizeof(u64)); } return res; } @@ -9983,7 +9980,7 @@ INTERCEPTOR(SSIZE_T, getrandom, void *buf, SIZE_T buflen, unsigned int flags) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, getrandom, buf, buflen, flags); // If GRND_NONBLOCK is set in the flags, it is non blocking. 
- static const int grnd_nonblock = 1; + static const int grnd_nonblock = 1; SSIZE_T n; if ((flags & grnd_nonblock)) n = REAL(getrandom)(buf, buflen, flags); diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index 3d1c24cddc64370..d5d81280e0b44c0 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -41,14 +41,6 @@ int main() { assert(cookie != 0); } - int signum; - res = prctl(PR_GET_PDEATHSIG, reinterpret_cast(&signum)); - if (res < 0) { - assert(errno == EINVAL); - } else { - assert(signum == 0); - } - char invname[81], vlname[] = "prctl"; for (auto i = 0; i < sizeof(invname); i++) { invname[i] = 0x1e; From 1d9e1c6644a03530efbb09d419013ec0bfe0c823 Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Thu, 8 Aug 2024 23:05:23 +0000 Subject: [PATCH 085/112] Revert "[LLDB] Impove ObjectFileELF's .dynamic parsing and usage. (#101237)" This reverts commit 28ba8a56b6fb9ec61897fa84369f46e43be94c03. Reverting since this broke the buildbot at https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/as-lldb-cmake/9352/. 
--- lldb/include/lldb/Symbol/ObjectFile.h | 5 +- .../Plugins/ObjectFile/ELF/ObjectFileELF.cpp | 422 +++++------------- .../Plugins/ObjectFile/ELF/ObjectFileELF.h | 35 +- lldb/source/Symbol/ObjectFile.cpp | 7 +- .../ObjectFile/ELF/Inputs/memory-elf.cpp | 5 - .../ObjectFile/ELF/elf-dynamic-no-shdrs.yaml | 90 ---- .../test/Shell/ObjectFile/ELF/elf-memory.test | 55 --- 7 files changed, 126 insertions(+), 493 deletions(-) delete mode 100644 lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp delete mode 100644 lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml delete mode 100644 lldb/test/Shell/ObjectFile/ELF/elf-memory.test diff --git a/lldb/include/lldb/Symbol/ObjectFile.h b/lldb/include/lldb/Symbol/ObjectFile.h index d89314d44bf6719..8592323322e383e 100644 --- a/lldb/include/lldb/Symbol/ObjectFile.h +++ b/lldb/include/lldb/Symbol/ObjectFile.h @@ -656,9 +656,8 @@ class ObjectFile : public std::enable_shared_from_this, // When an object file is in memory, subclasses should try and lock the // process weak pointer. If the process weak pointer produces a valid // ProcessSP, then subclasses can call this function to read memory. - static lldb::WritableDataBufferSP - ReadMemory(const lldb::ProcessSP &process_sp, lldb::addr_t addr, - size_t byte_size); + static lldb::DataBufferSP ReadMemory(const lldb::ProcessSP &process_sp, + lldb::addr_t addr, size_t byte_size); // This function returns raw file contents. Do not use it if you want // transparent decompression of section contents. 
diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 414ac6112d5797f..890db5c27481462 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -419,35 +419,19 @@ ObjectFile *ObjectFileELF::CreateInstance(const lldb::ModuleSP &module_sp, ObjectFile *ObjectFileELF::CreateMemoryInstance( const lldb::ModuleSP &module_sp, WritableDataBufferSP data_sp, const lldb::ProcessSP &process_sp, lldb::addr_t header_addr) { - if (!data_sp || data_sp->GetByteSize() < (llvm::ELF::EI_NIDENT)) - return nullptr; - const uint8_t *magic = data_sp->GetBytes(); - if (!ELFHeader::MagicBytesMatch(magic)) - return nullptr; - // Read the ELF header first so we can figure out how many bytes we need - // to read to get as least the ELF header + program headers. - DataExtractor data; - data.SetData(data_sp); - elf::ELFHeader hdr; - lldb::offset_t offset = 0; - if (!hdr.Parse(data, &offset)) - return nullptr; - - // Make sure the address size is set correctly in the ELF header. - if (!hdr.Is32Bit() && !hdr.Is64Bit()) - return nullptr; - // Figure out where the program headers end and read enough bytes to get the - // program headers in their entirety. 
- lldb::offset_t end_phdrs = hdr.e_phoff + (hdr.e_phentsize * hdr.e_phnum); - if (end_phdrs > data_sp->GetByteSize()) - data_sp = ReadMemory(process_sp, header_addr, end_phdrs); - - std::unique_ptr objfile_up( - new ObjectFileELF(module_sp, data_sp, process_sp, header_addr)); - ArchSpec spec = objfile_up->GetArchitecture(); - if (spec && objfile_up->SetModulesArchitecture(spec)) - return objfile_up.release(); - + if (data_sp && data_sp->GetByteSize() > (llvm::ELF::EI_NIDENT)) { + const uint8_t *magic = data_sp->GetBytes(); + if (ELFHeader::MagicBytesMatch(magic)) { + unsigned address_size = ELFHeader::AddressSizeInBytes(magic); + if (address_size == 4 || address_size == 8) { + std::unique_ptr objfile_up( + new ObjectFileELF(module_sp, data_sp, process_sp, header_addr)); + ArchSpec spec = objfile_up->GetArchitecture(); + if (spec && objfile_up->SetModulesArchitecture(spec)) + return objfile_up.release(); + } + } + } return nullptr; } @@ -889,37 +873,42 @@ Address ObjectFileELF::GetImageInfoAddress(Target *target) { if (!section_list) return Address(); - for (size_t i = 0; i < m_dynamic_symbols.size(); ++i) { - const ELFDynamic &symbol = m_dynamic_symbols[i].symbol; + // Find the SHT_DYNAMIC (.dynamic) section. + SectionSP dynsym_section_sp( + section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true)); + if (!dynsym_section_sp) + return Address(); + assert(dynsym_section_sp->GetObjectFile() == this); - if (symbol.d_tag != DT_DEBUG && symbol.d_tag != DT_MIPS_RLD_MAP && - symbol.d_tag != DT_MIPS_RLD_MAP_REL) - continue; + user_id_t dynsym_id = dynsym_section_sp->GetID(); + const ELFSectionHeaderInfo *dynsym_hdr = GetSectionHeaderByIndex(dynsym_id); + if (!dynsym_hdr) + return Address(); - // Compute the offset as the number of previous entries plus the size of - // d_tag. 
- const addr_t offset = (i * 2 + 1) * GetAddressByteSize(); - const addr_t d_file_addr = m_dynamic_base_addr + offset; - Address d_addr; - if (d_addr.ResolveAddressUsingFileSections(d_file_addr, GetSectionList())) - return Address(); - if (symbol.d_tag == DT_DEBUG) - return d_addr; + for (size_t i = 0; i < m_dynamic_symbols.size(); ++i) { + ELFDynamic &symbol = m_dynamic_symbols[i]; + if (symbol.d_tag == DT_DEBUG) { + // Compute the offset as the number of previous entries plus the size of + // d_tag. + addr_t offset = i * dynsym_hdr->sh_entsize + GetAddressByteSize(); + return Address(dynsym_section_sp, offset); + } // MIPS executables uses DT_MIPS_RLD_MAP_REL to support PIE. DT_MIPS_RLD_MAP // exists in non-PIE. - if ((symbol.d_tag == DT_MIPS_RLD_MAP || - symbol.d_tag == DT_MIPS_RLD_MAP_REL) && - target) { - const addr_t d_load_addr = d_addr.GetLoadAddress(target); - if (d_load_addr == LLDB_INVALID_ADDRESS) + else if ((symbol.d_tag == DT_MIPS_RLD_MAP || + symbol.d_tag == DT_MIPS_RLD_MAP_REL) && + target) { + addr_t offset = i * dynsym_hdr->sh_entsize + GetAddressByteSize(); + addr_t dyn_base = dynsym_section_sp->GetLoadBaseAddress(target); + if (dyn_base == LLDB_INVALID_ADDRESS) return Address(); Status error; if (symbol.d_tag == DT_MIPS_RLD_MAP) { // DT_MIPS_RLD_MAP tag stores an absolute address of the debug pointer. Address addr; - if (target->ReadPointerFromMemory(d_load_addr, error, addr, true)) + if (target->ReadPointerFromMemory(dyn_base + offset, error, addr, true)) return addr; } if (symbol.d_tag == DT_MIPS_RLD_MAP_REL) { @@ -927,17 +916,18 @@ Address ObjectFileELF::GetImageInfoAddress(Target *target) { // relative to the address of the tag. 
uint64_t rel_offset; rel_offset = target->ReadUnsignedIntegerFromMemory( - d_load_addr, GetAddressByteSize(), UINT64_MAX, error, true); + dyn_base + offset, GetAddressByteSize(), UINT64_MAX, error, true); if (error.Success() && rel_offset != UINT64_MAX) { Address addr; addr_t debug_ptr_address = - d_load_addr - GetAddressByteSize() + rel_offset; + dyn_base + (offset - GetAddressByteSize()) + rel_offset; addr.SetOffset(debug_ptr_address); return addr; } } } } + return Address(); } @@ -980,23 +970,62 @@ Address ObjectFileELF::GetBaseAddress() { return LLDB_INVALID_ADDRESS; } +// ParseDependentModules size_t ObjectFileELF::ParseDependentModules() { if (m_filespec_up) return m_filespec_up->GetSize(); m_filespec_up = std::make_unique(); - if (ParseDynamicSymbols()) { - for (const auto &entry : m_dynamic_symbols) { - if (entry.symbol.d_tag != DT_NEEDED) + if (!ParseSectionHeaders()) + return 0; + + SectionList *section_list = GetSectionList(); + if (!section_list) + return 0; + + // Find the SHT_DYNAMIC section. + Section *dynsym = + section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) + .get(); + if (!dynsym) + return 0; + assert(dynsym->GetObjectFile() == this); + + const ELFSectionHeaderInfo *header = GetSectionHeaderByIndex(dynsym->GetID()); + if (!header) + return 0; + // sh_link: section header index of string table used by entries in the + // section. + Section *dynstr = section_list->FindSectionByID(header->sh_link).get(); + if (!dynstr) + return 0; + + DataExtractor dynsym_data; + DataExtractor dynstr_data; + if (ReadSectionData(dynsym, dynsym_data) && + ReadSectionData(dynstr, dynstr_data)) { + ELFDynamic symbol; + const lldb::offset_t section_size = dynsym_data.GetByteSize(); + lldb::offset_t offset = 0; + + // The only type of entries we are concerned with are tagged DT_NEEDED, + // yielding the name of a required library. 
+ while (offset < section_size) { + if (!symbol.Parse(dynsym_data, &offset)) + break; + + if (symbol.d_tag != DT_NEEDED) continue; - if (!entry.name.empty()) { - FileSpec file_spec(entry.name); - FileSystem::Instance().Resolve(file_spec); - m_filespec_up->Append(file_spec); - } + + uint32_t str_index = static_cast(symbol.d_val); + const char *lib_name = dynstr_data.PeekCStr(str_index); + FileSpec file_spec(lib_name); + FileSystem::Instance().Resolve(file_spec); + m_filespec_up->Append(file_spec); } } + return m_filespec_up->GetSize(); } @@ -2443,47 +2472,48 @@ size_t ObjectFileELF::ParseDynamicSymbols() { if (m_dynamic_symbols.size()) return m_dynamic_symbols.size(); - std::optional dynamic_data = GetDynamicData(); - if (!dynamic_data) + SectionList *section_list = GetSectionList(); + if (!section_list) return 0; - ELFDynamicWithName e; - lldb::offset_t cursor = 0; - while (e.symbol.Parse(*dynamic_data, &cursor)) { - m_dynamic_symbols.push_back(e); - if (e.symbol.d_tag == DT_NULL) - break; - } - if (std::optional dynstr_data = GetDynstrData()) { - for (ELFDynamicWithName &entry : m_dynamic_symbols) { - switch (entry.symbol.d_tag) { - case DT_NEEDED: - case DT_SONAME: - case DT_RPATH: - case DT_RUNPATH: - case DT_AUXILIARY: - case DT_FILTER: { - lldb::offset_t cursor = entry.symbol.d_val; - const char *name = dynstr_data->GetCStr(&cursor); - if (name) - entry.name = std::string(name); - break; - } - default: + // Find the SHT_DYNAMIC section. 
+ Section *dynsym = + section_list->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) + .get(); + if (!dynsym) + return 0; + assert(dynsym->GetObjectFile() == this); + + ELFDynamic symbol; + DataExtractor dynsym_data; + if (ReadSectionData(dynsym, dynsym_data)) { + const lldb::offset_t section_size = dynsym_data.GetByteSize(); + lldb::offset_t cursor = 0; + + while (cursor < section_size) { + if (!symbol.Parse(dynsym_data, &cursor)) break; - } + + m_dynamic_symbols.push_back(symbol); } } + return m_dynamic_symbols.size(); } const ELFDynamic *ObjectFileELF::FindDynamicSymbol(unsigned tag) { if (!ParseDynamicSymbols()) return nullptr; - for (const auto &entry : m_dynamic_symbols) { - if (entry.symbol.d_tag == tag) - return &entry.symbol; + + DynamicSymbolCollIter I = m_dynamic_symbols.begin(); + DynamicSymbolCollIter E = m_dynamic_symbols.end(); + for (; I != E; ++I) { + ELFDynamic *symbol = &*I; + + if (symbol->d_tag == tag) + return symbol; } + return nullptr; } @@ -3200,10 +3230,7 @@ void ObjectFileELF::Dump(Stream *s) { ArchSpec header_arch = GetArchitecture(); *s << ", file = '" << m_file - << "', arch = " << header_arch.GetArchitectureName(); - if (m_memory_addr != LLDB_INVALID_ADDRESS) - s->Printf(", addr = %#16.16" PRIx64, m_memory_addr); - s->EOL(); + << "', arch = " << header_arch.GetArchitectureName() << "\n"; DumpELFHeader(s, m_header); s->EOL(); @@ -3221,8 +3248,6 @@ void ObjectFileELF::Dump(Stream *s) { s->EOL(); DumpDependentModules(s); s->EOL(); - DumpELFDynamic(s); - s->EOL(); } // DumpELFHeader @@ -3467,111 +3492,6 @@ void ObjectFileELF::DumpDependentModules(lldb_private::Stream *s) { } } -std::string static getDynamicTagAsString(uint16_t Arch, uint64_t Type) { -#define DYNAMIC_STRINGIFY_ENUM(tag, value) \ - case value: \ - return #tag; - -#define DYNAMIC_TAG(n, v) - switch (Arch) { - case llvm::ELF::EM_AARCH64: - switch (Type) { -#define AARCH64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include 
"llvm/BinaryFormat/DynamicTags.def" -#undef AARCH64_DYNAMIC_TAG - } - break; - - case llvm::ELF::EM_HEXAGON: - switch (Type) { -#define HEXAGON_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include "llvm/BinaryFormat/DynamicTags.def" -#undef HEXAGON_DYNAMIC_TAG - } - break; - - case llvm::ELF::EM_MIPS: - switch (Type) { -#define MIPS_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include "llvm/BinaryFormat/DynamicTags.def" -#undef MIPS_DYNAMIC_TAG - } - break; - - case llvm::ELF::EM_PPC: - switch (Type) { -#define PPC_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include "llvm/BinaryFormat/DynamicTags.def" -#undef PPC_DYNAMIC_TAG - } - break; - - case llvm::ELF::EM_PPC64: - switch (Type) { -#define PPC64_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include "llvm/BinaryFormat/DynamicTags.def" -#undef PPC64_DYNAMIC_TAG - } - break; - - case llvm::ELF::EM_RISCV: - switch (Type) { -#define RISCV_DYNAMIC_TAG(name, value) DYNAMIC_STRINGIFY_ENUM(name, value) -#include "llvm/BinaryFormat/DynamicTags.def" -#undef RISCV_DYNAMIC_TAG - } - break; - } -#undef DYNAMIC_TAG - switch (Type) { -// Now handle all dynamic tags except the architecture specific ones -#define AARCH64_DYNAMIC_TAG(name, value) -#define MIPS_DYNAMIC_TAG(name, value) -#define HEXAGON_DYNAMIC_TAG(name, value) -#define PPC_DYNAMIC_TAG(name, value) -#define PPC64_DYNAMIC_TAG(name, value) -#define RISCV_DYNAMIC_TAG(name, value) -// Also ignore marker tags such as DT_HIOS (maps to DT_VERNEEDNUM), etc. 
-#define DYNAMIC_TAG_MARKER(name, value) -#define DYNAMIC_TAG(name, value) \ - case value: \ - return #name; -#include "llvm/BinaryFormat/DynamicTags.def" -#undef DYNAMIC_TAG -#undef AARCH64_DYNAMIC_TAG -#undef MIPS_DYNAMIC_TAG -#undef HEXAGON_DYNAMIC_TAG -#undef PPC_DYNAMIC_TAG -#undef PPC64_DYNAMIC_TAG -#undef RISCV_DYNAMIC_TAG -#undef DYNAMIC_TAG_MARKER -#undef DYNAMIC_STRINGIFY_ENUM - default: - return "0x" + llvm::utohexstr(Type, true); - } -} - -void ObjectFileELF::DumpELFDynamic(lldb_private::Stream *s) { - ParseDynamicSymbols(); - if (m_dynamic_symbols.empty()) - return; - - s->PutCString(".dynamic:\n"); - s->PutCString("IDX d_tag d_val/d_ptr\n"); - s->PutCString("==== ---------------- ------------------\n"); - uint32_t idx = 0; - for (const auto &entry : m_dynamic_symbols) { - s->Printf("[%2u] ", idx++); - s->Printf( - "%-16s 0x%16.16" PRIx64, - getDynamicTagAsString(m_header.e_machine, entry.symbol.d_tag).c_str(), - entry.symbol.d_ptr); - if (!entry.name.empty()) - s->Printf(" \"%s\"", entry.name.c_str()); - s->EOL(); - } -} - ArchSpec ObjectFileELF::GetArchitecture() { if (!ParseHeader()) return ArchSpec(); @@ -3744,24 +3664,7 @@ llvm::ArrayRef ObjectFileELF::ProgramHeaders() { } DataExtractor ObjectFileELF::GetSegmentData(const ELFProgramHeader &H) { - // Try and read the program header from our cached m_data which can come from - // the file on disk being mmap'ed or from the initial part of the ELF file we - // read from memory and cached. - DataExtractor data = DataExtractor(m_data, H.p_offset, H.p_filesz); - if (data.GetByteSize() == H.p_filesz) - return data; - if (IsInMemory()) { - // We have a ELF file in process memory, read the program header data from - // the process. 
- if (ProcessSP process_sp = m_process_wp.lock()) { - const lldb::offset_t base_file_addr = GetBaseAddress().GetFileAddress(); - const addr_t load_bias = m_memory_addr - base_file_addr; - const addr_t data_addr = H.p_vaddr + load_bias; - if (DataBufferSP data_sp = ReadMemory(process_sp, data_addr, H.p_memsz)) - return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); - } - } - return DataExtractor(); + return DataExtractor(m_data, H.p_offset, H.p_filesz); } bool ObjectFileELF::AnySegmentHasPhysicalAddress() { @@ -3801,88 +3704,3 @@ ObjectFileELF::MapFileDataWritable(const FileSpec &file, uint64_t Size, return FileSystem::Instance().CreateWritableDataBuffer(file.GetPath(), Size, Offset); } - -std::optional ObjectFileELF::GetDynstrData() { - if (SectionList *section_list = GetSectionList()) { - // Find the SHT_DYNAMIC section. - if (Section *dynamic = - section_list - ->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) - .get()) { - assert(dynamic->GetObjectFile() == this); - if (const ELFSectionHeaderInfo *header = - GetSectionHeaderByIndex(dynamic->GetID())) { - // sh_link: section header index of string table used by entries in - // the section. - if (Section *dynstr = - section_list->FindSectionByID(header->sh_link).get()) { - DataExtractor data; - if (ReadSectionData(dynstr, data)) - return data; - } - } - } - } - - // Every ELF file which represents an executable or shared library has - // mandatory .dynamic entries. Two of these values are DT_STRTAB and DT_STRSZ - // and represent the dynamic symbol tables's string table. These are needed - // by the dynamic loader and we can read them from a process' address space. - // - // When loading and ELF file from memory, only the program headers end up - // being mapped into memory, and we can find these values in the PT_DYNAMIC - // segment. 
- const ELFDynamic *strtab = FindDynamicSymbol(DT_STRTAB); - const ELFDynamic *strsz = FindDynamicSymbol(DT_STRSZ); - if (strtab == nullptr || strsz == nullptr) - return std::nullopt; - - if (ProcessSP process_sp = m_process_wp.lock()) { - if (DataBufferSP data_sp = - ReadMemory(process_sp, strtab->d_ptr, strsz->d_val)) - return DataExtractor(data_sp, GetByteOrder(), GetAddressByteSize()); - } else { - // We have an ELF file with no section headers or we didn't find the - // .dynamic section. Try and find the .dynstr section. - Address addr; - if (addr.ResolveAddressUsingFileSections(strtab->d_ptr, GetSectionList())) { - DataExtractor data; - addr.GetSection()->GetSectionData(data); - return DataExtractor(data, - strtab->d_ptr - addr.GetSection()->GetFileAddress(), - strsz->d_val); - } - } - return std::nullopt; -} - -std::optional ObjectFileELF::GetDynamicData() { - DataExtractor data; - // The PT_DYNAMIC program header describes where the .dynamic section is and - // doesn't require parsing section headers. The PT_DYNAMIC is required by - // executables and shared libraries so it will always be available. - for (const ELFProgramHeader &H : ProgramHeaders()) { - if (H.p_type == llvm::ELF::PT_DYNAMIC) { - data = GetSegmentData(H); - if (data.GetByteSize() > 0) { - m_dynamic_base_addr = H.p_vaddr; - return data; - } - } - } - // Fall back to using section headers. - if (SectionList *section_list = GetSectionList()) { - // Find the SHT_DYNAMIC section. 
- if (Section *dynamic = - section_list - ->FindSectionByType(eSectionTypeELFDynamicLinkInfo, true) - .get()) { - assert(dynamic->GetObjectFile() == this); - if (ReadSectionData(dynamic, data)) { - m_dynamic_base_addr = dynamic->GetFileAddress(); - return data; - } - } - } - return std::nullopt; -} diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h index aba3a5bfcbf5b6c..844e981b1d890a3 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.h @@ -183,11 +183,7 @@ class ObjectFileELF : public lldb_private::ObjectFile { typedef SectionHeaderColl::iterator SectionHeaderCollIter; typedef SectionHeaderColl::const_iterator SectionHeaderCollConstIter; - struct ELFDynamicWithName { - elf::ELFDynamic symbol; - std::string name; - }; - typedef std::vector DynamicSymbolColl; + typedef std::vector DynamicSymbolColl; typedef DynamicSymbolColl::iterator DynamicSymbolCollIter; typedef DynamicSymbolColl::const_iterator DynamicSymbolCollConstIter; @@ -217,10 +213,6 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// Collection of section headers. SectionHeaderColl m_section_headers; - /// The file address of the .dynamic section. This can be found in the p_vaddr - /// of the PT_DYNAMIC program header. - lldb::addr_t m_dynamic_base_addr = LLDB_INVALID_ADDRESS; - /// Collection of symbols from the dynamic table. DynamicSymbolColl m_dynamic_symbols; @@ -392,9 +384,6 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// ELF dependent module dump routine. 
void DumpDependentModules(lldb_private::Stream *s); - /// ELF dump the .dynamic section - void DumpELFDynamic(lldb_private::Stream *s); - const elf::ELFDynamic *FindDynamicSymbol(unsigned tag); unsigned PLTRelocationType(); @@ -413,28 +402,6 @@ class ObjectFileELF : public lldb_private::ObjectFile { /// .gnu_debugdata section or \c nullptr if an error occured or if there's no /// section with that name. std::shared_ptr GetGnuDebugDataObjectFile(); - - /// Get the bytes that represent the .dynamic section. - /// - /// This function will fetch the data for the .dynamic section in an ELF file. - /// The PT_DYNAMIC program header will be used to extract the data and this - /// function will fall back to using the section headers if PT_DYNAMIC isn't - /// found. - /// - /// \return The bytes that represent the string table data or \c std::nullopt - /// if an error occured. - std::optional GetDynamicData(); - - /// Get the bytes that represent the dynamic string table data. - /// - /// This function will fetch the data for the string table in an ELF file. If - /// the ELF file is loaded from a file on disk, it will use the section - /// headers to extract the data and fall back to using the DT_STRTAB and - /// DT_STRSZ .dynamic entries. - /// - /// \return The bytes that represent the string table data or \c std::nullopt - /// if an error occured. 
- std::optional GetDynstrData(); }; #endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_ELF_OBJECTFILEELF_H diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 35317d209de1f9a..2608a9c5fb79a2f 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -454,10 +454,9 @@ AddressClass ObjectFile::GetAddressClass(addr_t file_addr) { return AddressClass::eUnknown; } -WritableDataBufferSP ObjectFile::ReadMemory(const ProcessSP &process_sp, - lldb::addr_t addr, - size_t byte_size) { - WritableDataBufferSP data_sp; +DataBufferSP ObjectFile::ReadMemory(const ProcessSP &process_sp, + lldb::addr_t addr, size_t byte_size) { + DataBufferSP data_sp; if (process_sp) { std::unique_ptr data_up(new DataBufferHeap(byte_size, 0)); Status error; diff --git a/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp b/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp deleted file mode 100644 index 9cae6c99c9f7616..000000000000000 --- a/lldb/test/Shell/ObjectFile/ELF/Inputs/memory-elf.cpp +++ /dev/null @@ -1,5 +0,0 @@ -#include -int main() { - printf("Hello World\n"); // Use something from libc.so - return 0; -} diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml b/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml deleted file mode 100644 index bc34e16e0b6ad24..000000000000000 --- a/lldb/test/Shell/ObjectFile/ELF/elf-dynamic-no-shdrs.yaml +++ /dev/null @@ -1,90 +0,0 @@ -## This test verifies that loading an ELF file that has no section headers can -## find the contents on the .dynamic section and the strings associated with -## the .dynamic seciton. 
-## - Loading the .dynamic section from the PT_DYNAMIC -## This test will make a simple executable that links against libc.so and we -## verify that we can find the DT_NEEDED entry with the shared library found -## in the .dynamic dump from "image dump objfile" - -# RUN: yaml2obj %s -o %t - -# RUN: %lldb -b \ -# RUN: -o "target create -d '%t'" \ -# RUN: -o "image dump objfile" \ -# RUN: | FileCheck %s --dump-input=always -# CHECK: (lldb) image dump objfile -# CHECK: Dumping headers for 1 module(s). -# CHECK: ObjectFileELF, file = -# CHECK: ELF Header -# Make sure there are no section headers -# CHECK: e_shnum = 0x00000000 - -# Make sure we find the program headers and see a PT_DYNAMIC entry. -# CHECK: Program Headers -# CHECK: IDX p_type p_offset p_vaddr p_paddr p_filesz p_memsz p_flags p_align -# CHECK: ==== --------------- -------- -------- -------- -------- -------- ------------------------- -------- -# CHECK: [ 0] PT_LOAD 000000b0 00000000 00000000 00000170 00000170 00000000 ( ) 00000001 -# CHECK: [ 1] PT_DYNAMIC 000001b0 00000100 00000100 00000070 00000070 00000000 ( ) 00000008 - -# CHECK: Dependent Modules: -# CHECK: ccc -# CHECK: aaa -# CHECK: bbb - -# Make sure we see some sections created from the program headers -# MAIN: SectID -# MAIN: PT_LOAD[0] - -# CHECK: .dynamic: -# CHECK: IDX d_tag d_val/d_ptr -# CHECK: ==== ---------------- ------------------ -# CHECK: [ 0] STRTAB 0x0000000000000000 -# CHECK: [ 1] NEEDED 0x0000000000000009 "ccc" -# CHECK: [ 2] NEEDED 0x0000000000000001 "aaa" -# CHECK: [ 3] NEEDED 0x0000000000000005 "bbb" -# CHECK: [ 4] STRSZ 0x0000000000000100 -# CHECK: [ 5] DEBUG 0x00000000deadbeef -# CHECK: [ 6] NULL 0x0000000000000000 - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_EXEC - Machine: EM_X86_64 -Sections: - - Type: SectionHeaderTable - NoHeaders: true - - Name: .dynstr - Type: SHT_STRTAB - Flags: [ SHF_ALLOC ] - Content: '00616161006262620063636300' ## 0,a,a,a,0,b,b,b,0,c,c,c,0 - Size: 0x100 - - Name: 
.dynamic - Type: SHT_DYNAMIC - Address: 0x100 - Entries: - - Tag: DT_STRTAB - Value: 0x0000000000000000 - - Tag: DT_NEEDED - Value: 9 - - Tag: DT_NEEDED - Value: 1 - - Tag: DT_NEEDED - Value: 5 - - Tag: DT_STRSZ - Value: 0x100 - - Tag: DT_DEBUG - Value: 0xdeadbeef - - Tag: DT_NULL - Value: 0x0 -ProgramHeaders: - - Type: PT_LOAD - VAddr: 0x0 - FirstSec: .dynstr - LastSec: .dynamic - - Type: PT_DYNAMIC - FirstSec: .dynamic - LastSec: .dynamic - VAddr: 0x100 - Align: 0x8 diff --git a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test b/lldb/test/Shell/ObjectFile/ELF/elf-memory.test deleted file mode 100644 index 0b1c01486a4b431..000000000000000 --- a/lldb/test/Shell/ObjectFile/ELF/elf-memory.test +++ /dev/null @@ -1,55 +0,0 @@ -// REQUIRES: system-linux, native - -// This test verifies that loading an ELF file from memory works and the new -// features that were added when loading from memory work like: -// - Loading the .dynamic section from the PT_DYNAMIC since ELF files loaded -// from memory don't have the section headers available. -// This test will make a simple executable that: -// - links against libc -// - runs and stops at a breakpoint -// - create a memory ELF file -// - verify that "image dump objfile" will dump the dynamic section of the -// memory elf file and find the .dynamic string table. 
- -// RUN: %clang_host %p/Inputs/memory-elf.cpp -g -O0 -o %t - -// RUN: %lldb %t -b \ -// RUN: -o "b main" \ -// RUN: -o "run" \ -// RUN: -o "script real_module = lldb.target.module[0]" \ -// RUN: -o "script base_addr = real_module.GetObjectFileHeaderAddress().GetLoadAddress(lldb.target)" \ -// RUN: -o "script mem_module = lldb.SBModule(lldb.process, base_addr)" \ -// RUN: -o "script target2 = lldb.debugger.CreateTarget('')" \ -// RUN: -o "script target2.AddModule(mem_module)" \ -// RUN: -o "target select 1" \ -// RUN: -o "image dump objfile" \ -// RUN: | FileCheck %s --check-prefix=MAIN --dump-input=always -// MAIN: (lldb) image dump objfile -// MAIN: Dumping headers for 1 module(s). -// MAIN: ObjectFileELF, file = '', arch = {{.*, addr = 0x[0-9a-f]+}} -// MAIN: ELF Header - -// Make sure we find the program headers and see a PT_DYNAMIC entry. -// MAIN: Program Headers -// MAIN: ] PT_DYNAMIC - -// Make sure we see some sections created from the program headers -// MAIN: SectID -// MAIN: PT_LOAD[0] - -// Ensure we find some dependent modules as won't find these if we aren't able -// to load the .dynamic section from the PT_DYNAMIC program header. -// MAIN: Dependent Modules: - -// Check for the .dynamic dump and ensure we find all dynamic entries that are -// required to be there and needed to get the .dynstr section and the symbol -// table, and the DT_DEBUG entry to find the list of shared libraries. 
-// MAIN: .dynamic: -// Make sure we found the .dynstr section by checking for valid strings after NEEDED -// MAIN-DAG: NEEDED {{0x[0-9a-f]+ ".*libc.*"}} -// MAIN-DAG: STRTAB {{0x[0-9a-f]+}} -// MAIN-DAG: SYMTAB {{0x[0-9a-f]+}} -// MAIN-DAG: STRSZ {{0x[0-9a-f]+}} -// MAIN-DAG: SYMENT {{0x[0-9a-f]+}} -// MAIN-DAG: DEBUG {{0x[0-9a-f]+}} -// MAIN: NULL {{0x[0-9a-f]+}} From 8acf8852e9d4044456a782ff0089a4becfa2df86 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 8 Aug 2024 16:32:54 -0700 Subject: [PATCH 086/112] [LLVM][rtsan] Add RealtimeSanitizer transform pass (#101232) Split from #100596. Introduce the RealtimeSanitizer transform, which inserts the rtsan_enter/exit functions at the appropriate places in an instrumented function. --- .../Instrumentation/RealtimeSanitizer.h | 41 +++++++++++++ llvm/lib/Passes/PassBuilder.cpp | 6 ++ llvm/lib/Passes/PassRegistry.def | 4 ++ .../Transforms/Instrumentation/CMakeLists.txt | 1 + .../Instrumentation/RealtimeSanitizer.cpp | 61 +++++++++++++++++++ .../RealtimeSanitizer/rtsan.ll | 27 ++++++++ .../RealtimeSanitizer/rtsan_multi_return.ll | 30 +++++++++ 7 files changed, 170 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h create mode 100644 llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp create mode 100644 llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll create mode 100644 llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll diff --git a/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h new file mode 100644 index 000000000000000..b141ad27996b7b6 --- /dev/null +++ b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h @@ -0,0 +1,41 @@ +//===- RealtimeSanitizer.h - RealtimeSanitizer instrumentation --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the RealtimeSanitizer, an LLVM transformation for +// detecting and reporting realtime safety violations. +// +// The instrumentation pass inserts calls to __rtsan_realtime_enter and +// __rtsan_realtime_exit at the entry and exit points of functions that are +// marked with the appropriate attribute. +// +// See also: llvm-project/compiler-rt/lib/rtsan/ +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H +#define LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +struct RealtimeSanitizerOptions {}; + +class RealtimeSanitizerPass : public PassInfoMixin { +public: + RealtimeSanitizerPass(const RealtimeSanitizerOptions &Options); + PreservedAnalyses run(Function &F, AnalysisManager &AM); + + static bool isRequired() { return true; } + +private: + RealtimeSanitizerOptions Options{}; +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_INSTRUMENTATION_REALTIMESANITIZER_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index bcc69d5ac3db67e..7bc1c870ce5191a 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -199,6 +199,7 @@ #include "llvm/Transforms/Instrumentation/PGOForceFunctionAttrs.h" #include "llvm/Transforms/Instrumentation/PGOInstrumentation.h" #include "llvm/Transforms/Instrumentation/PoisonChecking.h" +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" #include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h" #include "llvm/Transforms/Instrumentation/SanitizerCoverage.h" #include "llvm/Transforms/Instrumentation/ThreadSanitizer.h" @@ -1210,6 +1211,11 @@ parseRegAllocFastPassOptions(PassBuilder &PB, StringRef Params) { return Opts; } +Expected 
parseRtSanPassOptions(StringRef Params) { + RealtimeSanitizerOptions Result; + return Result; +} + } // namespace /// Tests whether a pass name starts with a valid prefix for a default pipeline diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 61a5bab92927fe6..95842d15a35bf60 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -592,6 +592,10 @@ FUNCTION_PASS_WITH_PARAMS( return WinEHPreparePass(DemoteCatchSwitchPHIOnly); }, parseWinEHPrepareOptions, "demote-catchswitch-only") +FUNCTION_PASS_WITH_PARAMS( + "rtsan", "RealtimeSanitizerPass", + [](RealtimeSanitizerOptions Opts) { return RealtimeSanitizerPass(Opts); }, + parseRtSanPassOptions, "") #undef FUNCTION_PASS_WITH_PARAMS #ifndef LOOPNEST_PASS diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 4e3f9e27e0c3446..deab37801ff1df8 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_component_library(LLVMInstrumentation ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp + RealtimeSanitizer.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp new file mode 100644 index 000000000000000..d3028ee97c3eb8d --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp @@ -0,0 +1,61 @@ +//===- RealtimeSanitizer.cpp - RealtimeSanitizer instrumentation *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of the RealtimeSanitizer, an LLVM transformation for +// detecting and reporting realtime safety violations. +// +// See also: llvm-project/compiler-rt/lib/rtsan/ +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/Analysis.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +#include "llvm/Transforms/Instrumentation/RealtimeSanitizer.h" + +using namespace llvm; + +static void insertCallBeforeInstruction(Function &Fn, Instruction &Instruction, + const char *FunctionName) { + LLVMContext &Context = Fn.getContext(); + FunctionType *FuncType = FunctionType::get(Type::getVoidTy(Context), false); + FunctionCallee Func = + Fn.getParent()->getOrInsertFunction(FunctionName, FuncType); + IRBuilder<> Builder{&Instruction}; + Builder.CreateCall(Func, {}); +} + +static void insertCallAtFunctionEntryPoint(Function &Fn, + const char *InsertFnName) { + + insertCallBeforeInstruction(Fn, Fn.front().front(), InsertFnName); +} + +static void insertCallAtAllFunctionExitPoints(Function &Fn, + const char *InsertFnName) { + for (auto &BB : Fn) + for (auto &I : BB) + if (auto *RI = dyn_cast(&I)) + insertCallBeforeInstruction(Fn, I, InsertFnName); +} + +RealtimeSanitizerPass::RealtimeSanitizerPass( + const RealtimeSanitizerOptions &Options) + : Options{Options} {} + +PreservedAnalyses RealtimeSanitizerPass::run(Function &F, + AnalysisManager &AM) { + if (F.hasFnAttribute(Attribute::SanitizeRealtime)) { + insertCallAtFunctionEntryPoint(F, "__rtsan_realtime_enter"); + insertCallAtAllFunctionExitPoints(F, "__rtsan_realtime_exit"); + return PreservedAnalyses::none(); + } + + return PreservedAnalyses::all(); +} diff --git a/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll new file mode 100644 index 
000000000000000..a0bc4aef2cc319a --- /dev/null +++ b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -passes=rtsan -S | FileCheck %s + +define void @violation() #0 { + %1 = alloca ptr, align 8 + %2 = call ptr @malloc(i64 noundef 2) #3 + store ptr %2, ptr %1, align 8 + ret void +} + +declare ptr @malloc(i64 noundef) #1 + +define noundef i32 @main() #2 { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + call void @violation() #4 + ret i32 0 +} + +attributes #0 = { mustprogress noinline sanitize_realtime optnone ssp uwtable(sync) } + +; RealtimeSanitizer pass should insert __rtsan_realtime_enter right after function definition +; CHECK-LABEL: @violation() +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_enter + +; RealtimeSanitizer pass should insert __rtsan_realtime_exit right before function return +; CHECK: call{{.*}}@__rtsan_realtime_exit +; CHECK-NEXT: ret{{.*}}void diff --git a/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll new file mode 100644 index 000000000000000..39a1ff0b7c442ae --- /dev/null +++ b/llvm/test/Instrumentation/RealtimeSanitizer/rtsan_multi_return.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -passes=rtsan -S | FileCheck %s + +define i32 @example(i32 %x) #0 { +entry: + %retval = alloca i32 + %cmp = icmp sgt i32 %x, 10 + br i1 %cmp, label %then, label %else + +then: + ret i32 1 + +else: + ret i32 0 +} + +attributes #0 = { mustprogress noinline sanitize_realtime optnone ssp uwtable(sync) } + +; RealtimeSanitizer pass should insert __rtsan_realtime_enter right after function definition +; CHECK-LABEL: @example( +; CHECK-NEXT: entry: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_enter + +; RealtimeSanitizer pass should insert the call at both function returns +; CHECK-LABEL: then: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_exit +; CHECK-NEXT: ret i32 1 + +; CHECK-LABEL: else: +; CHECK-NEXT: call{{.*}}@__rtsan_realtime_exit +; 
CHECK-NEXT: ret i32 0 From 9428631d7673387d7761358ec05015d7810c9511 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 8 Aug 2024 23:33:12 +0000 Subject: [PATCH 087/112] [gn build] Port 8acf8852e9d4 --- .../gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn index 0f67db3549fb140..aeaeb576513576b 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn @@ -32,6 +32,7 @@ static_library("Instrumentation") { "PGOInstrumentation.cpp", "PGOMemOPSizeOpt.cpp", "PoisonChecking.cpp", + "RealtimeSanitizer.cpp", "SanitizerBinaryMetadata.cpp", "SanitizerCoverage.cpp", "ThreadSanitizer.cpp", From 6aad62cf5b7f91f4b02266cf72469e2c8e28dbef Mon Sep 17 00:00:00 2001 From: Sayhaan Siddiqui <49014204+sayhaan@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:41:51 -0700 Subject: [PATCH 088/112] [BOLT][DWARF] Add parallelization for processing of DWO debug information (#100282) Enables parallelization for the processing of DWO CUs. 
--- bolt/docs/CommandLineArgumentReference.md | 4 ++ bolt/include/bolt/Core/ParallelUtilities.h | 3 +- bolt/include/bolt/Rewrite/DWARFRewriter.h | 3 +- bolt/lib/Core/ParallelUtilities.cpp | 11 +++-- bolt/lib/Rewrite/DWARFRewriter.cpp | 45 +++++++++++++------ ...f4-cross-cu-backward-different-abbrev.test | 2 +- ...rf4-cross-cu-forward-different-abbrev.test | 2 +- ...oclist-dwarf4-loclist--dwarf5-loclist.test | 2 +- bolt/test/X86/dwarf4-df-dualcu-loclist.test | 2 +- .../X86/dwarf4-split-dwarf-no-address.test | 2 +- ...dwarf4-subprogram-multiple-ranges-cus.test | 2 +- bolt/test/X86/dwarf4-types-dwarf5-types.test | 2 +- bolt/test/X86/dwarf4-types-dwarf5.test | 2 +- bolt/test/X86/dwarf5-addr-section-reuse.s | 2 +- .../dwarf5-call-pc-function-null-check.test | 2 +- bolt/test/X86/dwarf5-call-pc.test | 2 +- bolt/test/X86/dwarf5-cu-no-debug-addr.test | 2 +- .../X86/dwarf5-df-input-lowpc-ranges-cus.test | 2 +- bolt/test/X86/dwarf5-df-mono-dualcu.test | 2 +- .../X86/dwarf5-df-output-dir-same-name.test | 2 +- .../test/X86/dwarf5-df-types-debug-names.test | 2 +- ...dwarf5-df-types-modify-dwo-name-mixed.test | 2 +- .../X86/dwarf5-df-types-modify-dwo-name.test | 2 +- ...4-gdb-index-types-gdb-generated-gdb11.test | 2 +- ...f4-gdb-index-types-gdb-generated-gdb9.test | 2 +- ...-dwarf4-gdb-index-types-lld-generated.test | 2 +- bolt/test/X86/dwarf5-dwarf4-monolithic.test | 2 +- ...ypes-backward-forward-cross-reference.test | 2 +- bolt/test/X86/dwarf5-empty-arange.test | 2 +- ...5-gdb-index-types-gdb-generated-gdb11.test | 2 +- ...f5-gdb-index-types-gdb-generated-gdb9.test | 2 +- .../dwarf5-gdb-index-types-lld-generated.test | 2 +- bolt/test/X86/dwarf5-locexpr-referrence.test | 2 +- bolt/test/X86/dwarf5-loclist-offset-form.test | 2 +- .../X86/dwarf5-one-loclists-two-bases.test | 2 +- bolt/test/X86/dwarf5-return-pc-form-addr.test | 2 +- bolt/test/X86/dwarf5-return-pc.test | 2 +- bolt/test/X86/dwarf5-shared-str-offset-base.s | 2 +- .../X86/dwarf5-split-dwarf4-monolithic.test | 2 +- 
...5-split-gdb-index-types-gdb-generated.test | 2 +- ...dwarf5-subprogram-multiple-ranges-cus.test | 2 +- .../X86/dwarf5-two-cu-str-offset-table.test | 2 +- bolt/test/X86/dwarf5-two-loclists.test | 2 +- bolt/test/X86/dwarf5-two-rnglists.test | 2 +- ...arf5-type-unit-no-cu-str-offset-table.test | 2 +- .../dwarf5-types-backward-cross-reference.s | 2 +- bolt/test/X86/dwarf5-types-debug-names.test | 2 +- 47 files changed, 88 insertions(+), 62 deletions(-) diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 0c8935457366dbb..6d3b797da3787e3 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -113,6 +113,10 @@ Prints out offsets for abbrev and debug_info of Skeleton CUs that get patched. +- `--debug-thread-count=` + + Specifies the number of threads to be used when processing DWO debug information. + - `--dot-tooltip-code` Add basic block instructions as tool tips on nodes diff --git a/bolt/include/bolt/Core/ParallelUtilities.h b/bolt/include/bolt/Core/ParallelUtilities.h index e7b35a79acc78cc..9f75e2f6bd81d29 100644 --- a/bolt/include/bolt/Core/ParallelUtilities.h +++ b/bolt/include/bolt/Core/ParallelUtilities.h @@ -50,7 +50,8 @@ enum SchedulingPolicy { }; /// Return the managed thread pool and initialize it if not initialized. -ThreadPoolInterface &getThreadPool(); +ThreadPoolInterface & +getThreadPool(const unsigned ThreadsCount = opts::ThreadCount); /// Perform the work on each BinaryFunction except those that are accepted /// by SkipPredicate, scheduling heuristic is based on SchedPolicy. diff --git a/bolt/include/bolt/Rewrite/DWARFRewriter.h b/bolt/include/bolt/Rewrite/DWARFRewriter.h index deaf179140c0145..624245650a0924b 100644 --- a/bolt/include/bolt/Rewrite/DWARFRewriter.h +++ b/bolt/include/bolt/Rewrite/DWARFRewriter.h @@ -184,7 +184,8 @@ class DWARFRewriter { /// Output .dwo files. 
void writeDWOFiles(DWARFUnit &, const OverriddenSectionsMap &, const std::string &, DebugLocWriter &, - DebugStrOffsetsWriter &, DebugStrWriter &); + DebugStrOffsetsWriter &, DebugStrWriter &, + DebugRangesSectionWriter &); using KnownSectionsEntry = std::pair<MCSection *, DWARFSectionKind>; }; diff --git a/bolt/lib/Core/ParallelUtilities.cpp b/bolt/lib/Core/ParallelUtilities.cpp index a24c37c06f1ac1c..3a8a7dc0aee7bd7 100644 --- a/bolt/lib/Core/ParallelUtilities.cpp +++ b/bolt/lib/Core/ParallelUtilities.cpp @@ -49,7 +49,7 @@ namespace ParallelUtilities { namespace { /// A single thread pool that is used to run parallel tasks -std::unique_ptr<DefaultThreadPool> ThreadPoolPtr; +std::unique_ptr<ThreadPoolInterface> ThreadPoolPtr; unsigned computeCostFor(const BinaryFunction &BF, const PredicateTy &SkipPredicate, @@ -102,12 +102,15 @@ inline unsigned estimateTotalCost(const BinaryContext &BC, } // namespace -ThreadPoolInterface &getThreadPool() { +ThreadPoolInterface &getThreadPool(const unsigned ThreadsCount) { if (ThreadPoolPtr.get()) return *ThreadPoolPtr; - ThreadPoolPtr = std::make_unique<DefaultThreadPool>( - llvm::hardware_concurrency(opts::ThreadCount)); + if (ThreadsCount > 1) + ThreadPoolPtr = std::make_unique<DefaultThreadPool>( + llvm::hardware_concurrency(ThreadsCount)); + else + ThreadPoolPtr = std::make_unique<SingleThreadExecutor>(); return *ThreadPoolPtr; } diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp index 98f81f44d64901f..f9cb1b3895e79bb 100644 --- a/bolt/lib/Rewrite/DWARFRewriter.cpp +++ b/bolt/lib/Rewrite/DWARFRewriter.cpp @@ -329,6 +329,12 @@ static cl::opt<bool> KeepARanges( "keep or generate .debug_aranges section if .gdb_index is written"), cl::Hidden, cl::cat(BoltCategory)); +static cl::opt<unsigned> + DebugThreadCount("debug-thread-count", cl::desc("specifies thread count for the multithreading " "for updating DWO debug info"), cl::init(1), cl::cat(BoltCategory)); + static cl::opt<std::string> DwarfOutputPath( "dwarf-output-path", cl::desc("Path to where .dwo files will be written out to."), cl::init(""), @@ -475,8 +481,8 @@ static void
emitDWOBuilder(const std::string &DWOName, DWARFUnit &SplitCU, DWARFUnit &CU, DebugLocWriter &LocWriter, DebugStrOffsetsWriter &StrOffstsWriter, - DebugStrWriter &StrWriter, - GDBIndex &GDBIndexSection) { + DebugStrWriter &StrWriter, GDBIndex &GDBIndexSection, + DebugRangesSectionWriter &TempRangesSectionWriter) { // Populate debug_info and debug_abbrev for current dwo into StringRef. DWODIEBuilder.generateAbbrevs(); DWODIEBuilder.finish(); @@ -532,7 +538,7 @@ static void emitDWOBuilder(const std::string &DWOName, OverriddenSections[Kind] = Contents; } Rewriter.writeDWOFiles(CU, OverriddenSections, DWOName, LocWriter, - StrOffstsWriter, StrWriter); + StrOffstsWriter, StrWriter, TempRangesSectionWriter); } using DWARFUnitVec = std::vector; @@ -646,7 +652,6 @@ void DWARFRewriter::updateDebugInfo() { *StrWriter); GDBIndex GDBIndexSection(BC); auto processSplitCU = [&](DWARFUnit &Unit, DWARFUnit &SplitCU, - DIEBuilder &DIEBlder, DebugRangesSectionWriter &TempRangesSectionWriter, DebugAddrWriter &AddressWriter, const std::string &DWOName, @@ -669,7 +674,7 @@ void DWARFRewriter::updateDebugInfo() { emitDWOBuilder(DWOName, DWODIEBuilder, *this, SplitCU, Unit, DebugLocDWoWriter, DWOStrOffstsWriter, DWOStrWriter, - GDBIndexSection); + GDBIndexSection, TempRangesSectionWriter); }; auto processMainBinaryCU = [&](DWARFUnit &Unit, DIEBuilder &DIEBlder) { std::optional SplitCU; @@ -716,9 +721,13 @@ void DWARFRewriter::updateDebugInfo() { finalizeTypeSections(DIEBlder, *Streamer, GDBIndexSection); CUPartitionVector PartVec = partitionCUs(*BC.DwCtx); + const unsigned int ThreadCount = + std::min(opts::DebugThreadCount, opts::ThreadCount); for (std::vector &Vec : PartVec) { DIEBlder.buildCompileUnits(Vec); llvm::SmallVector, 72> DWODIEBuildersByCU; + ThreadPoolInterface &ThreadPool = + ParallelUtilities::getThreadPool(ThreadCount); for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) { createRangeLocListAddressWriters(*CU); std::optional SplitCU; @@ -729,9 +738,9 @@ void 
DWARFRewriter::updateDebugInfo() { continue; DebugAddrWriter &AddressWriter = *AddressWritersByCU[CU->getOffset()].get(); - DebugRangesSectionWriter *TempRangesSectionWriter = - CU->getVersion() >= 5 ? RangeListsWritersByCU[*DWOId].get() - : LegacyRangesWritersByCU[*DWOId].get(); + DebugRangesSectionWriter &TempRangesSectionWriter = + CU->getVersion() >= 5 ? *RangeListsWritersByCU[*DWOId].get() + : *LegacyRangesWritersByCU[*DWOId].get(); std::optional DwarfOutputPath = opts::DwarfOutputPath.empty() ? std::nullopt @@ -744,9 +753,17 @@ void DWARFRewriter::updateDebugInfo() { *DWODIEBuildersByCU.emplace_back(std::move(DWODIEBuilderPtr)).get(); if (CU->getVersion() >= 5) StrOffstsWriter->finalizeSection(*CU, DIEBlder); - processSplitCU(*CU, **SplitCU, DIEBlder, *TempRangesSectionWriter, - AddressWriter, DWOName, DwarfOutputPath, DWODIEBuilder); + // Important to capture CU and SplitCU by value here, otherwise when the + // thread is executed at some point after the current iteration of the + // loop, dereferencing CU/SplitCU in the call to processSplitCU means it + // will dereference a different variable than the one intended, causing a + // seg fault. + ThreadPool.async([&, DwarfOutputPath, DWOName, CU, SplitCU] { + processSplitCU(*CU, **SplitCU, TempRangesSectionWriter, AddressWriter, + DWOName, DwarfOutputPath, DWODIEBuilder); + }); } + ThreadPool.wait(); for (std::unique_ptr &DWODIEBuilderPtr : DWODIEBuildersByCU) DWODIEBuilderPtr->updateDebugNamesTable(); for (DWARFUnit *CU : DIEBlder.getProcessedCUs()) @@ -1807,7 +1824,8 @@ std::optional updateDebugData( void DWARFRewriter::writeDWOFiles( DWARFUnit &CU, const OverriddenSectionsMap &OverridenSections, const std::string &DWOName, DebugLocWriter &LocWriter, - DebugStrOffsetsWriter &StrOffstsWriter, DebugStrWriter &StrWriter) { + DebugStrOffsetsWriter &StrOffstsWriter, DebugStrWriter &StrWriter, + DebugRangesSectionWriter &TempRangesSectionWriter) { // Setup DWP code once. 
DWARFContext *DWOCtx = BC.getDWOContext(); const uint64_t DWOId = *CU.getDWOId(); @@ -1854,9 +1872,8 @@ void DWARFRewriter::writeDWOFiles( DebugRangeListsSectionWriter *RangeListssWriter = nullptr; if (CU.getVersion() == 5) { - assert(RangeListsWritersByCU.count(DWOId) != 0 && - "No RangeListsWriter for DWO ID."); - RangeListssWriter = RangeListsWritersByCU[DWOId].get(); + RangeListssWriter = + llvm::dyn_cast(&TempRangesSectionWriter); // Handling .debug_rnglists.dwo separately. The original .o/.dwo might not // have .debug_rnglists so won't be part of the loop below. diff --git a/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test b/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test index 555887a067589f3..b06cec6fe6b962c 100644 --- a/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test +++ b/bolt/test/X86/dwarf4-cross-cu-backward-different-abbrev.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-dst.s -o %t.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-src.s -o %t1.o # RUN: %clang %cflags -gdwarf-4 %t.o %t1.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test b/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test index 74c9491d95d36ea..9adbf4eef9114ca 100644 --- a/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test +++ b/bolt/test/X86/dwarf4-cross-cu-forward-different-abbrev.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj 
-triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-dst.s -o %t.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-cross-reference-different-abbrev-src.s -o %t1.o # RUN: %clang %cflags -gdwarf-4 %t1.o %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test b/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test index 6bcf8892ed0a8a0..6f14c95236f1319 100644 --- a/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test +++ b/bolt/test/X86/dwarf4-cross-cu-loclist-dwarf4-loclist--dwarf5-loclist.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-two-entries-loclist.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-df-dualcu-loclist.test b/bolt/test/X86/dwarf4-df-dualcu-loclist.test index 57c75e282421aca..a094d46c8354c73 100644 --- a/bolt/test/X86/dwarf4-df-dualcu-loclist.test +++ b/bolt/test/X86/dwarf4-df-dualcu-loclist.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc 
-dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-df-dualcu-loclist-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -O2 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo | FileCheck -check-prefix=PRE-BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.dwo.dwo | FileCheck -check-prefix=BOLT-DWO-MAIN %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-info helper.dwo | FileCheck -check-prefix=PRE-BOLT-DWO-HELPER %s diff --git a/bolt/test/X86/dwarf4-split-dwarf-no-address.test b/bolt/test/X86/dwarf4-split-dwarf-no-address.test index fc6d8d324b9597c..014f76ca3d21ca5 100644 --- a/bolt/test/X86/dwarf4-split-dwarf-no-address.test +++ b/bolt/test/X86/dwarf4-split-dwarf-no-address.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc --split-dwarf-file=helper.dwo --triple=x86_64-unknown-linux-gnu \ ; RUN: --filetype=obj %p/Inputs/dwarf4-split-dwarf-no-address-helper.s -o=helper.o ; RUN: %clang %cflags -gdwarf-4 -gsplit-dwarf=split main.o helper.o -o main.exe -fno-pic -no-pie -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt | FileCheck -check-prefix=BOLT %s ;; Testing that there are no asserts/crashes when one of the DWARF4 CUs does not modify .debug_addr diff --git a/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test b/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test index c9ade995b70878d..4e98299c7aa3696 100644 --- a/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test +++ 
b/bolt/test/X86/dwarf4-subprogram-multiple-ranges-cus.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-subprogram-multiple-ranges-main.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-subprogram-multiple-ranges-other.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-objdump %t.bolt --disassemble > %t1.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t1.txt # RUN: cat %t1.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf4-types-dwarf5-types.test b/bolt/test/X86/dwarf4-types-dwarf5-types.test index a253f2283609017..af01d1467e73f44 100644 --- a/bolt/test/X86/dwarf4-types-dwarf5-types.test +++ b/bolt/test/X86/dwarf4-types-dwarf5-types.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf4-types-dwarf5.test b/bolt/test/X86/dwarf4-types-dwarf5.test index 1eb42683e40ee81..dd0a8efe3f520a9 100644 --- a/bolt/test/X86/dwarf4-types-dwarf5.test +++ b/bolt/test/X86/dwarf4-types-dwarf5.test @@ -3,7 +3,7 @@ # RUN: llvm-mc 
-dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-types-dwarf5-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf5-addr-section-reuse.s b/bolt/test/X86/dwarf5-addr-section-reuse.s index cf511d6d111e079..10cecf7b2964247 100644 --- a/bolt/test/X86/dwarf5-addr-section-reuse.s +++ b/bolt/test/X86/dwarf5-addr-section-reuse.s @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-helper2-addr-section-reuse.s -o %thelper2.o # RUN: %clang %cflags -dwarf-5 %thelper1.o %tmain.o %thelper2.o -o %t.exe -Wl,-q # RUN: llvm-dwarfdump --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-info %t.exe.bolt | FileCheck --check-prefix=POSTCHECK %s ## This test checks that when a binary is bolted if CU is not modified and has DW_AT_addr_base that is shared diff --git a/bolt/test/X86/dwarf5-call-pc-function-null-check.test b/bolt/test/X86/dwarf5-call-pc-function-null-check.test index 761a4da696217c5..2b489542f998289 100644 --- a/bolt/test/X86/dwarf5-call-pc-function-null-check.test +++ b/bolt/test/X86/dwarf5-call-pc-function-null-check.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux 
%p/Inputs/dwarf5-call-pc-function-null-check-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-function-null-check-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=CHECK %s diff --git a/bolt/test/X86/dwarf5-call-pc.test b/bolt/test/X86/dwarf5-call-pc.test index dc7773dc053d90a..a4359295556b9b0 100644 --- a/bolt/test/X86/dwarf5-call-pc.test +++ b/bolt/test/X86/dwarf5-call-pc.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-call-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-cu-no-debug-addr.test b/bolt/test/X86/dwarf5-cu-no-debug-addr.test index e78b68680d6cc10..44721b187504b1e 100644 --- a/bolt/test/X86/dwarf5-cu-no-debug-addr.test +++ b/bolt/test/X86/dwarf5-cu-no-debug-addr.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux 
%p/Inputs/dwarf5-cu-no-debug-addr-main.s -o %t1main.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-cu-no-debug-addr-helper.s -o %t1helper.o # RUN: %clang %cflags -dwarf-5 %t1main.o %t1helper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test b/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test index a325395fd532027..801389c60feacc4 100644 --- a/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test +++ b/bolt/test/X86/dwarf5-df-input-lowpc-ranges-cus.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-input-lowpc-ranges-other.s \ ; RUN: -split-dwarf-file=mainOther.dwo -o other.o ; RUN: %clang %cflags main.o other.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-rnglists main.exe.bolt &> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.exe.bolt >> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt diff --git a/bolt/test/X86/dwarf5-df-mono-dualcu.test b/bolt/test/X86/dwarf5-df-mono-dualcu.test index 13272cc1c3c4da9..66c0a3b1ad6fb33 100644 --- a/bolt/test/X86/dwarf5-df-mono-dualcu.test +++ b/bolt/test/X86/dwarf5-df-mono-dualcu.test @@ -5,7 +5,7 @@ ; RUN: -split-dwarf-file=main.dwo -o main.o ; RUN: llvm-mc -filetype=obj -triple x86_64-unknown-linux-gnu %p/Inputs/dwarf5-df-mono-helper.s 
-o=helper.o ; RUN: %clang %cflags -gdwarf-5 main.o helper.o -o main.exe -fno-pic -no-pie -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --always-convert-to-ranges +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --always-convert-to-ranges --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.exe.bolt &> %t/foo.txt ; RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe.bolt >> %t/foo.txt diff --git a/bolt/test/X86/dwarf5-df-output-dir-same-name.test b/bolt/test/X86/dwarf5-df-output-dir-same-name.test index b466f87d95e5eb1..29658a8b5910e87 100644 --- a/bolt/test/X86/dwarf5-df-output-dir-same-name.test +++ b/bolt/test/X86/dwarf5-df-output-dir-same-name.test @@ -9,7 +9,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-output-dir-same-name-helper.s \ ; RUN: -split-dwarf-file=objects/o2/split.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --dwarf-output-path=%t/dwo +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --dwarf-output-path=%t/dwo --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: ls -l %t/dwo > log ; RUN: llvm-dwarfdump --debug-info main.exe.bolt >> log ; RUN: cat log | FileCheck -check-prefix=BOLT %s diff --git a/bolt/test/X86/dwarf5-df-types-debug-names.test b/bolt/test/X86/dwarf5-df-types-debug-names.test index 7c1c8e4fd5b383d..f96a5b8dccf0b53 100644 --- a/bolt/test/X86/dwarf5-df-types-debug-names.test +++ b/bolt/test/X86/dwarf5-df-types-debug-names.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang 
%cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info --debug-names main.exe.bolt >> log.txt diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test index c8cfd82753d7795..e8c78b211cc4b9d 100644 --- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test +++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name-mixed.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-dup-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt diff --git a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test index 12a7f648c23257f..024bb5613f94b1f 100644 --- a/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test +++ b/bolt/test/X86/dwarf5-df-types-modify-dwo-name.test @@ -6,7 +6,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-df-types-debug-names-helper.s \ ; RUN: -split-dwarf-file=helper.dwo -o helper.o ; RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=split main.o helper.o -o main.exe -; RUN: llvm-bolt main.exe -o main.exe.bolt 
--update-debug-sections +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info -r 0 main.exe.bolt > log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 main.dwo.dwo >> log.txt ; RUN: llvm-dwarfdump --debug-info -r 0 helper.dwo.dwo >> log.txt diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test index 10ad6ed404f1c1a..465062560d4fc14 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb11.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-dwarf4-gdb-index-types-v8.generted-gdb11.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test index 2da0bcca89b2ac7..484cf0f1526781d 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-gdb-generated-gdb9.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-dwarf4-gdb-index-types-v8.generted-gdb9.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test index 9be540352005de1..7589bfac57f58aa 100644 --- a/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-dwarf4-gdb-index-types-lld-generated.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf4-gdb-index-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -Wl,--gdb-index -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by LLD. diff --git a/bolt/test/X86/dwarf5-dwarf4-monolithic.test b/bolt/test/X86/dwarf5-dwarf4-monolithic.test index ff0f6990aaac0f3..37fded1f3d6fdd7 100644 --- a/bolt/test/X86/dwarf5-dwarf4-monolithic.test +++ b/bolt/test/X86/dwarf5-dwarf4-monolithic.test @@ -5,7 +5,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper1.s -o %t1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper2.s -o %t2.o # RUN: %clang %cflags -dwarf-5 %tmain.o %t0.o %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-line %t.exe > %t_line.txt # 
RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt diff --git a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test index b48d6a5dc20d4d9..9ff64cb1ca250e7 100644 --- a/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test +++ b/bolt/test/X86/dwarf5-dwarf4-types-backward-forward-cross-reference.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-types-backward-forward-cross-reference-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-types-backward-forward-cross-reference-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=POSTCHECKADDR %s # RUN: llvm-dwarfdump --show-form --verbose --debug-types %t.bolt | FileCheck --check-prefix=POSTCHECKTU %s diff --git a/bolt/test/X86/dwarf5-empty-arange.test b/bolt/test/X86/dwarf5-empty-arange.test index 61e966204843420..4ed3c1dc0a6e496 100644 --- a/bolt/test/X86/dwarf5-empty-arange.test +++ b/bolt/test/X86/dwarf5-empty-arange.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-empty-arange-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-empty-arange-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,--entry=main -Wl,-q -Wl,-gc-sections -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o 
%t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-aranges %t.bolt > %t.txt # RUN: llvm-dwarfdump --debug-info -r 0 %t.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test index 338a476e46f3b3e..139b24afa1b0dac 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb11.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-gdb-index-types-v8.generted-gdb11.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. 
diff --git a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test index c9d3913a1933cda..26ee101e9d1d189 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-gdb-generated-gdb9.test @@ -4,7 +4,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelpergdb.o # RUN: %clang %cflags %tmaingdb.o %thelpergdb.o -o %tgdb.exe -Wl,-q # RUN: llvm-objcopy %tgdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-gdb-index-types-v8.generted-gdb9.gdb-index -# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections +# RUN: llvm-bolt %tgdb.exe -o %tgdb.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %tgdb.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB. diff --git a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test index a770e40260dde34..731c56013339957 100644 --- a/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test +++ b/bolt/test/X86/dwarf5-gdb-index-types-lld-generated.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-gdb-index-types-helper.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -Wl,--gdb-index -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by LLD. 
diff --git a/bolt/test/X86/dwarf5-locexpr-referrence.test b/bolt/test/X86/dwarf5-locexpr-referrence.test index cc7bb27ce602e7b..5b38987e0a7126a 100644 --- a/bolt/test/X86/dwarf5-locexpr-referrence.test +++ b/bolt/test/X86/dwarf5-locexpr-referrence.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-locexpr-referrence-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-locexpr-referrence-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=CHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt | FileCheck --check-prefix=CHECKADDR %s diff --git a/bolt/test/X86/dwarf5-loclist-offset-form.test b/bolt/test/X86/dwarf5-loclist-offset-form.test index 3178c11a67069fb..de6a6090efa7f4b 100644 --- a/bolt/test/X86/dwarf5-loclist-offset-form.test +++ b/bolt/test/X86/dwarf5-loclist-offset-form.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist-offset-form-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loclist-offset-form-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt 
>> %t.txt diff --git a/bolt/test/X86/dwarf5-one-loclists-two-bases.test b/bolt/test/X86/dwarf5-one-loclists-two-bases.test index f25f6c7a468581e..9e6fdb94b916a8d 100644 --- a/bolt/test/X86/dwarf5-one-loclists-two-bases.test +++ b/bolt/test/X86/dwarf5-one-loclists-two-bases.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-loc-base-no-loc-accesshelper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-return-pc-form-addr.test b/bolt/test/X86/dwarf5-return-pc-form-addr.test index 5a83615cac031cc..8e35ef670cfc2cb 100644 --- a/bolt/test/X86/dwarf5-return-pc-form-addr.test +++ b/bolt/test/X86/dwarf5-return-pc-form-addr.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-form-addr-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: 
llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-return-pc.test b/bolt/test/X86/dwarf5-return-pc.test index e9ef99ef5b945fb..21b6854474a62fc 100644 --- a/bolt/test/X86/dwarf5-return-pc.test +++ b/bolt/test/X86/dwarf5-return-pc.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-return-pc-helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse +# RUN: llvm-bolt %t.exe -o %t.exe.bolt --update-debug-sections -reorder-blocks=reverse --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe > %tmain.txt # RUN: llvm-objdump %t.exe --disassemble >> %tmain.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe.bolt > %tmainbolt.txt diff --git a/bolt/test/X86/dwarf5-shared-str-offset-base.s b/bolt/test/X86/dwarf5-shared-str-offset-base.s index d8492298a1604b5..f00efd54a4b02c6 100644 --- a/bolt/test/X86/dwarf5-shared-str-offset-base.s +++ b/bolt/test/X86/dwarf5-shared-str-offset-base.s @@ -3,7 +3,7 @@ # RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %tmain.o --defsym MAIN=0 # RUN: llvm-mc --filetype=obj --triple x86_64 %s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %tmain.exe -# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +# RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --debug-info %tmain.exe.bolt > %tout.text # RUN: llvm-dwarfdump --show-section-sizes %tmain.exe >> %tout.text # RUN: llvm-dwarfdump --show-section-sizes %tmain.exe.bolt >> %tout.text diff --git 
a/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test b/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test index 2cfe5e26bd4cdc1..3eb6a724523d987 100644 --- a/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test +++ b/bolt/test/X86/dwarf5-split-dwarf4-monolithic.test @@ -9,7 +9,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux -split-dwarf-file=helper1.dwo %p/Inputs/dwarf5-split-dwarf4-monolithic-helper1.s -o helper1.o # RUN: llvm-mc -dwarf-version=4 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-dwarf4-monolithic-helper2.s -o helper2.o # RUN: %clang %cflags -dwarf-5 main.o helper0.o helper1.o helper2.o -o main.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges main.exe -o main.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges main.exe -o main.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info main.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-line main.exe | FileCheck --check-prefix=PRECHECK-LINE %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr main.bolt > boltout.txt diff --git a/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test b/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test index ec2b8f7084c78d6..6fcb5a97c1488c9 100644 --- a/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test +++ b/bolt/test/X86/dwarf5-split-gdb-index-types-gdb-generated.test @@ -7,7 +7,7 @@ # RUN: llvm-mc --split-dwarf-file=helper.dwo -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-split-gdb-index-types-helper.s -o helpergdb.o # RUN: %clang %cflags maingdb.o helpergdb.o -o maingdb.exe -Wl,-q # RUN: llvm-objcopy maingdb.exe --add-section=.gdb_index=%p/Inputs/dwarf5-split-gdb-index-types-v8.gdb-index -# RUN: llvm-bolt maingdb.exe -o maingdb.exe.bolt --update-debug-sections +# RUN: llvm-bolt maingdb.exe -o 
maingdb.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --gdb-index maingdb.exe.bolt | FileCheck --check-prefix=POSTCHECK %s ## Tests that BOLT correctly handles gdb-index generated by GDB with split-dwarf DWARF4. diff --git a/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test b/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test index bcf63fe6a0d8ceb..57f8d7e99bcafa8 100644 --- a/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test +++ b/bolt/test/X86/dwarf5-subprogram-multiple-ranges-cus.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-subprogram-multiple-ranges-main.s -o %t1.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-subprogram-multiple-ranges-other.s -o %t2.o # RUN: %clang %cflags %t1.o %t2.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-objdump %t.bolt --disassemble > %t1.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t1.txt # RUN: cat %t1.txt | FileCheck --check-prefix=POSTCHECK %s diff --git a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test index 20503951df4e18f..e59664e3281a1a7 100644 --- a/bolt/test/X86/dwarf5-two-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-two-cu-str-offset-table.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 
--cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.exe > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt >> %t.txt # RUN: cat %t.txt | FileCheck --check-prefix=CHECK %s diff --git a/bolt/test/X86/dwarf5-two-loclists.test b/bolt/test/X86/dwarf5-two-loclists.test index a7c6351f9813cc6..5b3417e86109a92 100644 --- a/bolt/test/X86/dwarf5-two-loclists.test +++ b/bolt/test/X86/dwarf5-two-loclists.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-two-rnglists.test b/bolt/test/X86/dwarf5-two-rnglists.test index 98f2e347d7673b6..3db47f983eaa26c 100644 --- a/bolt/test/X86/dwarf5-two-rnglists.test +++ b/bolt/test/X86/dwarf5-two-rnglists.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_main.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5_helper.s -o %thelper.o # RUN: %clang %cflags -dwarf-5 %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt --always-convert-to-ranges %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump 
--show-form --verbose --debug-info %t.exe | FileCheck --check-prefix=PRECHECK %s # RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt diff --git a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test index 21ced6ce687b5c2..dc6255ff8c7bc2d 100644 --- a/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test +++ b/bolt/test/X86/dwarf5-type-unit-no-cu-str-offset-table.test @@ -3,7 +3,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-basic-cu.s -o %tmain.o # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-no-cu.s -o %thelper.o # RUN: %clang %cflags %tmain.o %thelper.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.exe | FileCheck -check-prefix=PRE-BOLT %s # RUN: llvm-dwarfdump --show-form --verbose --debug-str-offsets %t.bolt | FileCheck -check-prefix=POST-BOLT %s diff --git a/bolt/test/X86/dwarf5-types-backward-cross-reference.s b/bolt/test/X86/dwarf5-types-backward-cross-reference.s index 2345cac2fde9697..17a8d620493a608 100644 --- a/bolt/test/X86/dwarf5-types-backward-cross-reference.s +++ b/bolt/test/X86/dwarf5-types-backward-cross-reference.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o # RUN: %clang %cflags -gdwarf-5 %t.o -o %t.exe -# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 # RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck --check-prefix=POSTCHECK %s ## This test checks that BOLT handles 
backward cross CU references for dwarf5 diff --git a/bolt/test/X86/dwarf5-types-debug-names.test b/bolt/test/X86/dwarf5-types-debug-names.test index 94624298e289de7..8da35574b9e997d 100644 --- a/bolt/test/X86/dwarf5-types-debug-names.test +++ b/bolt/test/X86/dwarf5-types-debug-names.test @@ -1,7 +1,7 @@ ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-main.s -o %tmain.o ; RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %p/Inputs/dwarf5-types-debug-names-helper.s -o %thelper.o ; RUN: %clang %cflags -gdwarf-5 %tmain.o %thelper.o -o %tmain.exe -; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections +; RUN: llvm-bolt %tmain.exe -o %tmain.exe.bolt --update-debug-sections --debug-thread-count=4 --cu-processing-batch-size=4 ; RUN: llvm-dwarfdump --debug-info --debug-names %tmain.exe.bolt > %tlog.txt ; RUN: cat %tlog.txt | FileCheck -check-prefix=BOLT %s From f86594788ce93b696675c94f54016d27a6c21d18 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 8 Aug 2024 16:43:20 -0700 Subject: [PATCH 089/112] [rtsan] Fix warnings after #101232 --- .../llvm/Transforms/Instrumentation/RealtimeSanitizer.h | 3 --- llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h index b141ad27996b7b6..f2ce1636551ce2f 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h +++ b/llvm/include/llvm/Transforms/Instrumentation/RealtimeSanitizer.h @@ -31,9 +31,6 @@ class RealtimeSanitizerPass : public PassInfoMixin { PreservedAnalyses run(Function &F, AnalysisManager &AM); static bool isRequired() { return true; } - -private: - RealtimeSanitizerOptions Options{}; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp 
b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp index d3028ee97c3eb8d..5663f446613b502 100644 --- a/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/RealtimeSanitizer.cpp @@ -41,13 +41,12 @@ static void insertCallAtAllFunctionExitPoints(Function &Fn, const char *InsertFnName) { for (auto &BB : Fn) for (auto &I : BB) - if (auto *RI = dyn_cast(&I)) + if (isa(&I)) insertCallBeforeInstruction(Fn, I, InsertFnName); } RealtimeSanitizerPass::RealtimeSanitizerPass( - const RealtimeSanitizerOptions &Options) - : Options{Options} {} + const RealtimeSanitizerOptions &Options) {} PreservedAnalyses RealtimeSanitizerPass::run(Function &F, AnalysisManager &AM) { From 4ce559d059a91b161d991cacb5951abff653c5e6 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Thu, 8 Aug 2024 17:02:04 -0700 Subject: [PATCH 090/112] [msan] Support most Arm NEON vector shift instructions (#102507) This adds support for the Arm NEON vector shift instructions that follow the same pattern as x86 (handleVectorShiftIntrinsic). VSLI is not supported because it does not follow the 2-argument pattern expected by handleVectorShiftIntrinsic. 
This patch also updates the arm64-vshift.ll MSan test that was introduced in https://github.com/llvm/llvm-project/commit/5d0a12d3e9b1606c36430cf908da20d19d101e04 --- .../Instrumentation/MemorySanitizer.cpp | 17 + .../MemorySanitizer/AArch64/arm64-vshift.ll | 1420 +++++++++-------- 2 files changed, 801 insertions(+), 636 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 45b3edf2b8f2313..a1632a93966c89f 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4116,6 +4116,23 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::x86_mmx_psrli_q: case Intrinsic::x86_mmx_psrai_w: case Intrinsic::x86_mmx_psrai_d: + case Intrinsic::aarch64_neon_rshrn: + case Intrinsic::aarch64_neon_sqrshl: + case Intrinsic::aarch64_neon_sqrshrn: + case Intrinsic::aarch64_neon_sqrshrun: + case Intrinsic::aarch64_neon_sqshl: + case Intrinsic::aarch64_neon_sqshlu: + case Intrinsic::aarch64_neon_sqshrn: + case Intrinsic::aarch64_neon_sqshrun: + case Intrinsic::aarch64_neon_srshl: + case Intrinsic::aarch64_neon_sshl: + case Intrinsic::aarch64_neon_uqrshl: + case Intrinsic::aarch64_neon_uqrshrn: + case Intrinsic::aarch64_neon_uqshl: + case Intrinsic::aarch64_neon_uqshrn: + case Intrinsic::aarch64_neon_urshl: + case Intrinsic::aarch64_neon_ushl: + // Not handled here: aarch64_neon_vsli (vector shift left and insert) handleVectorShiftIntrinsic(I, /* Variable */ false); break; case Intrinsic::x86_avx2_psllv_d: diff --git a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll index a755562d683fbd6..4cc038f03ff2c36 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/AArch64/arm64-vshift.ll @@ -35,7 +35,12 @@ define <8 x i8> @sqshl8b(ptr %A, ptr %B) nounwind 
sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -74,7 +79,12 @@ define <4 x i16> @sqshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -113,7 +123,12 @@ define <2 x i32> @sqshl2s(ptr %A, ptr %B) 
nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -152,7 +167,12 @@ define <1 x i64> @sqshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -179,7 +199,8 @@ define <1 x i64> 
@sqshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -217,7 +238,10 @@ define i64 @sqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -244,7 +268,8 @@ define i64 @sqshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; 
CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -282,7 +307,12 @@ define <8 x i8> @uqshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -321,7 +351,12 @@ define <4 x i16> @uqshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 
x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -360,7 +395,12 @@ define <2 x i32> @uqshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -399,7 +439,13 @@ define <16 x i8> @sqshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -438,7 +484,13 @@ define <8 x i16> @sqshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -477,7 +529,13 @@ define <4 x i32> @sqshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] 
to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -516,7 +574,13 @@ define <2 x i64> @sqshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -555,7 +619,13 @@ define <16 x i8> @uqshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -594,7 +664,13 @@ define <8 x i16> @uqshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -633,7 +709,13 @@ define <4 x i32> @uqshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], 
[[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -672,7 +754,13 @@ define <2 x i64> @uqshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -711,7 +799,12 @@ define <1 x i64> @uqshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr 
i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -738,7 +831,8 @@ define <1 x i64> @uqshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -776,7 +870,10 @@ define i64 @uqshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = 
sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -803,7 +900,8 @@ define i64 @uqshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -864,7 +962,12 @@ define <8 x i8> @srshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -903,7 +1006,12 @@ define <4 x i16> @srshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -942,7 +1050,12 @@ define <2 x i32> @srshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -981,7 +1094,12 @@ define <1 x i64> @srshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1008,7 +1126,8 @@ define <1 x i64> @srshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1046,7 +1165,10 @@ define i64 @srshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: 
[[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1073,7 +1195,8 @@ define i64 @srshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1111,7 +1234,12 @@ define <8 x i8> @urshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> 
[[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1150,7 +1278,12 @@ define <4 x i16> @urshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1189,7 +1322,12 @@ define <2 x i32> @urshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> 
[[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1228,7 +1366,12 @@ define <1 x i64> @urshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1255,7 +1398,8 @@ define <1 x i64> @urshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store 
<1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -1293,7 +1437,10 @@ define i64 @urshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1320,7 +1467,8 @@ define i64 @urshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -1358,7 +1506,13 @@ define <16 x i8> @srshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: 
[[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1397,7 +1551,13 @@ define <8 x i16> @srshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -1436,7 +1596,13 @@ define <4 x i32> @srshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1475,7 +1641,13 @@ define <2 x i64> @srshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -1514,7 +1686,13 @@ define <16 x i8> @urshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 
[[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1553,7 +1731,13 @@ define <8 x i16> @urshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: 
ret <8 x i16> [[TMP3]] @@ -1592,7 +1776,13 @@ define <4 x i32> @urshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -1631,7 +1821,13 @@ define <2 x i64> @urshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -1692,7 +1888,12 @@ define <8 x i8> @sqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1731,7 +1932,12 @@ define <4 x i16> @sqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = 
call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1770,7 +1976,12 @@ define <2 x i32> @sqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1809,7 +2020,12 @@ define <8 x i8> @uqrshl8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i8>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i8> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <8 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP17]], [[TMP16]] ; 
CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -1848,7 +2064,12 @@ define <4 x i16> @uqrshl4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i16>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i16> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <4 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -1887,7 +2108,12 @@ define <2 x i32> @uqrshl2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i32>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x 
i32> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -1926,7 +2152,13 @@ define <16 x i8> @sqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -1965,7 +2197,13 @@ define <8 x i16> @sqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; 
CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2004,7 +2242,13 @@ define <4 x i32> @sqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2043,7 +2287,13 @@ define <2 x i64> @sqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: 
[[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2082,7 +2332,12 @@ define <1 x i64> @sqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2109,7 +2364,8 @@ define <1 x i64> @sqrshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> 
@llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2147,7 +2403,10 @@ define i64 @sqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2174,7 +2433,8 @@ define i64 @sqrshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2212,7 +2472,13 @@ define <16 x i8> @uqrshl16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; 
CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <16 x i8>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <16 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2251,7 +2517,13 @@ define <8 x i16> @uqrshl8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ 
-2290,7 +2562,13 @@ define <4 x i32> @uqrshl4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2329,7 +2607,13 @@ define <2 x i64> @uqrshl2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i128 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i64 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = sext i1 [[TMP15]] to i128 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i128 [[TMP16]] to <2 x i64> +; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2368,7 +2652,12 @@ define <1 x i64> @uqrshl1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <1 x i64>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[_MSLD1]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i64 [[TMP13]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = sext i1 [[TMP14]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64 [[TMP15]] to <1 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP17]], [[TMP16]] ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> [[TMP2]]) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2395,7 +2684,8 @@ define <1 x i64> @uqrshl1d_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2433,7 +2723,10 @@ define i64 
@uqrshl_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load i64, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], [[_MSLD1]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne i64 [[_MSLD1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = sext i1 [[TMP13]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[_MSLD]], i64 [[TMP2]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP15]], [[TMP14]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[TMP1]], i64 [[TMP2]]) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2460,7 +2753,8 @@ define i64 @uqrshl_scalar_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2508,7 +2802,8 @@ define <8 x i8> @urshr8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -2534,7 +2829,8 @@ define <4 x i16> @urshr4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -2560,7 +2856,8 @@ define <2 x i32> @urshr2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -2586,7 +2883,8 @@ define <16 x i8> @urshr16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], 
zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2612,7 +2910,8 @@ define <8 x i16> @urshr8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2638,7 +2937,8 @@ define <4 x i32> @urshr4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2664,7 +2964,8 @@ define <2 x i64> @urshr2d(ptr %A) nounwind sanitize_memory { ; 
CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2690,7 +2991,8 @@ define <1 x i64> @urshr1d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -2716,7 +3018,8 @@ define i64 @urshr_scalar(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2742,7 +3045,8 @@ define <8 x i8> @srshr8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -2768,7 +3072,8 @@ define <4 x i16> @srshr4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -2794,7 +3099,8 @@ define <2 x i32> @srshr2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -2820,7 +3126,8 @@ define <16 x i8> @srshr16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -2846,7 +3153,8 @@ define <8 x i16> @srshr8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -2872,7 +3180,8 @@ define <4 x i32> @srshr4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: 
[[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -2898,7 +3207,8 @@ define <2 x i64> @srshr2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -2924,7 +3234,8 @@ define <1 x i64> @srshr1d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret 
<1 x i64> [[TMP3]] @@ -2950,7 +3261,8 @@ define i64 @srshr_scalar(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -2976,7 +3288,8 @@ define <8 x i8> @sqshlu8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -3002,7 +3315,8 @@ define <4 x i16> @sqshlu4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -3028,7 +3342,8 @@ define <2 x i32> @sqshlu2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -3054,7 +3369,8 @@ define <16 x i8> @sqshlu16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -3080,7 +3396,8 @@ define <8 x i16> @sqshlu8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> 
[[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -3106,7 +3423,8 @@ define <4 x i32> @sqshlu4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -3132,7 +3450,8 @@ define <2 x i64> @sqshlu2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -3158,7 +3477,8 @@ define <1 x i64> @sqshlu1d_constant(ptr %A) 
nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -3184,7 +3504,8 @@ define i64 @sqshlu_i64_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[_MSLD]], i64 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 [[TMP1]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -3210,7 +3531,8 @@ define i32 @sqshlu_i32_constant(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[_MSLD]], i32 1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i32 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 [[TMP1]], i32 1) ; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 
[[TMP3]] @@ -3248,15 +3570,10 @@ define <8 x i8> @rshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -3280,15 +3597,10 @@ define <4 x i16> @rshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: 
store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -3312,15 +3624,10 @@ define <2 x i32> @rshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -3356,15 +3663,10 @@ define <16 x i8> @rshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -3404,15 +3706,10 @@ define <8 x i16> @rshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -3452,15 +3749,10 @@ define <4 x i32> 
@rshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -3713,14 +4005,10 @@ define i32 @sqshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 [[A]], i32 1) -; 
CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1) @@ -3743,15 +4031,10 @@ define <8 x i8> @sqshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -3775,15 +4058,10 @@ define <4 x i16> @sqshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 
x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -3807,15 +4085,10 @@ define <2 x i32> @sqshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -3852,15 +4125,10 @@ define <16 x i8> @sqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label 
[[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -3900,15 +4168,10 @@ define <8 x i16> @sqshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -3948,15 +4211,10 @@ define <4 x i32> @sqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -3978,14 +4236,10 @@ define i32 @sqshrun1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; 
CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1) @@ -4008,15 +4262,10 @@ define <8 x i8> @sqshrun8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4040,15 +4289,10 @@ define <4 x i16> @sqshrun4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4072,15 +4316,10 @@ define <2 x i32> @sqshrun2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4116,15 +4355,10 @@ define <16 x i8> @sqshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: 
[[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4164,15 +4398,10 @@ define <8 x i16> @sqshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; 
CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4212,15 +4441,10 @@ define <4 x i32> @sqshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4242,14 +4466,10 @@ define i32 @sqrshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp 
ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1) @@ -4272,15 +4492,10 @@ define <8 x i8> @sqrshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4304,15 +4519,10 @@ define <4 x i16> @sqrshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: 
[[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4336,15 +4546,10 @@ define <2 x i32> @sqrshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ 
-4380,15 +4585,10 @@ define <16 x i8> @sqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4428,15 +4628,10 @@ define <8 x i16> @sqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> 
@llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4476,15 +4671,10 @@ define <4 x i32> @sqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4506,14 +4696,10 @@ define 
i32 @sqrshrun1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1) @@ -4536,15 +4722,10 @@ define <8 x i8> @sqrshrun8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = 
load <8 x i16>, ptr %A @@ -4568,15 +4749,10 @@ define <4 x i16> @sqrshrun4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4600,15 +4776,10 @@ define <2 x i32> @sqrshrun2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 
x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4644,15 +4815,10 @@ define <16 x i8> @sqrshrun16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4692,15 +4858,10 @@ define <8 x i16> @sqrshrun8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: 
br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -4740,15 +4901,10 @@ define <4 x i32> @sqrshrun4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], 
<4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -4770,14 +4926,10 @@ define i32 @uqrshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1) @@ -4800,15 +4952,10 @@ define <8 x i8> @uqrshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> 
@llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -4832,15 +4979,10 @@ define <4 x i16> @uqrshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -4864,15 +5006,10 @@ define <2 x i32> @uqrshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; 
CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -4908,15 +5045,10 @@ define <16 x i8> @uqrshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -4956,15 +5088,10 @@ define <8 x i16> @uqrshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: 
[[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -5004,15 +5131,10 @@ define <4 x i32> @uqrshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -5034,14 +5156,10 @@ define i32 @uqshrn1s(i64 %A) nounwind sanitize_memory { ; CHECK-SAME: i64 [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[TMP1]], i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 [[A]], i32 1) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i32 [[TMP]] ; %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1) @@ -5064,15 +5182,10 @@ define <8 x i8> @uqshrn8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable 
-; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: store <8 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <8 x i8> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] ; %tmp1 = load <8 x i16>, ptr %A @@ -5096,15 +5209,10 @@ define <4 x i16> @uqshrn4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] ; %tmp1 = load <4 x i32>, ptr %A @@ -5128,15 +5236,10 @@ define <2 x i32> @uqshrn2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 
[[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[_MSLD]], i32 1) +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] ; %tmp1 = load <2 x i64>, ptr %A @@ -5172,15 +5275,10 @@ define <16 x i8> @uqshrn16b(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i16> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i8> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i8> [[_MSLD]], <8 x i8> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[OUT]], <8 x i8> [[TMP3]], <16 x i32> ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <16 x i8> [[TMP4]] @@ -5220,15 +5318,10 @@ define <8 x i16> @uqshrn8h(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <4 x i32>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i16> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i16> [[_MSLD]], <4 x i16> [[TMP14]], <8 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[OUT]], <4 x i16> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP4]] @@ -5268,15 +5361,10 @@ define <4 x i32> @uqshrn4s(ptr %ret, ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 ; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr ; CHECK-NEXT: [[_MSLD1:%.*]] = load <2 x i64>, ptr [[TMP12]], align 16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[_MSLD1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF0]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 15: +; CHECK-NEXT: 
[[TMP13:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[_MSLD1]], i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = or <2 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[TMP1]], i32 1) -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i32> [[_MSLD]], <2 x i32> [[TMP14]], <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[OUT]], <2 x i32> [[TMP3]], <4 x i32> ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] @@ -5507,7 +5595,8 @@ define <8 x i16> @neon.ushll8h_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP2]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5536,7 +5625,13 @@ define <8 x i16> @neon.ushl8h_no_constant_shift(ptr %A) nounwind sanitize_memory ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], [[_MSPROP]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i16> [[_MSPROP]] to i128 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i128 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i64 
[[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i1 [[TMP9]] to i128 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i128 [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> [[TMP2]]) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP12]], [[TMP11]] ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP2]]) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5565,7 +5660,8 @@ define <4 x i32> @neon.ushl8h_constant_shift_extend_not_2x(ptr %A) nounwind sani ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i8> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5592,7 +5688,8 @@ define <8 x i16> @neon.ushl8_noext_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: 
store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5620,7 +5717,8 @@ define <4 x i32> @neon.ushll4s_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5650,7 +5748,8 @@ define <4 x i32> @neon.ushll4s_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5666,8 +5765,10 @@ define <4 x i32> @neon.ushll4s_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @neon.ushll4s_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> zeroinitializer, 
<4 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> ) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmp3 = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> , <4 x i32> ) @@ -5692,7 +5793,8 @@ define <2 x i64> @neon.ushll2d_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <2 x i32> [[_MSLD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -5721,7 +5823,8 @@ define <1 x i64> @neon.ushl_vscalar_constant_shift(ptr %A) nounwind sanitize_mem ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <1 x i32> [[_MSLD]] to <1 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <1 x i32> [[TMP1]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[_MSPROP]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[TMP2]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret 
<1 x i64> [[TMP3]] @@ -5750,7 +5853,8 @@ define i64 @neon.ushl_scalar_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[_MSPROP]], i64 1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[TMP2]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -5844,7 +5948,8 @@ define <16 x i8> @neon.sshl16b_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5870,7 +5975,8 @@ define <16 x i8> @neon.sshl16b_non_splat_constant_shift(ptr %A) nounwind sanitiz ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; 
CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5896,7 +6002,8 @@ define <16 x i8> @neon.sshl16b_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP2]] @@ -5924,7 +6031,8 @@ define <8 x i16> @neon.sshll8h_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <8 x i8> [[_MSLD]] to <8 x i16> ; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[_MSPROP]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[TMP2]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -5953,7 +6061,8 @@ define <4 x i32> @neon.sshl4s_wrong_ext_constant_shift(ptr %A) nounwind sanitize ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i8>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i8> [[_MSLD]] to <4 x 
i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -5982,7 +6091,8 @@ define <4 x i32> @neon.sshll4s_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6011,7 +6121,8 @@ define <4 x i32> @neon.sshll4s_neg_constant_shift(ptr %A) nounwind sanitize_memo ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[_MSLD]] to <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSPROP]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP2]], <4 x 
i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6027,8 +6138,10 @@ define <4 x i32> @neon.sshl4s_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <4 x i32> @neon.sshl4s_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> zeroinitializer, <4 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> ) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] ; %tmp3 = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> , <4 x i32> ) @@ -6051,7 +6164,8 @@ define <4 x i32> @neon.sshl4s_no_fold(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6079,7 +6193,8 @@ define <2 x i64> @neon.sshll2d_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[_MSPROP:%.*]] = sext <2 x i32> [[_MSLD]] to <2 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 
x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6108,7 +6223,8 @@ define <1 x i64> @neon.sshll_vscalar_constant_shift(ptr %A) nounwind sanitize_me ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i32>, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext <1 x i32> [[_MSLD]] to <1 x i64> ; CHECK-NEXT: [[TMP2:%.*]] = zext <1 x i32> [[TMP1]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[_MSPROP]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <1 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[TMP2]], <1 x i64> ) ; CHECK-NEXT: store <1 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[TMP3]] @@ -6137,7 +6253,8 @@ define i64 @neon.sshll_scalar_constant_shift(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[_MSPROP]], i64 1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[TMP2]], i64 1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -6166,7 +6283,8 @@ define i64 @neon.sshll_scalar_constant_shift_m1(ptr %A) nounwind sanitize_memory ; 
CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP6]], align 4 ; CHECK-NEXT: [[_MSPROP:%.*]] = zext i32 [[_MSLD]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[_MSPROP]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[_MSPROP]], i64 -1) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i64 [[TMP7]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[TMP2]], i64 -1) ; CHECK-NEXT: store i64 [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret i64 [[TMP3]] @@ -6182,8 +6300,10 @@ define <2 x i64> @neon.sshl2d_constant_fold() nounwind sanitize_memory { ; CHECK-LABEL: define <2 x i64> @neon.sshl2d_constant_fold( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> zeroinitializer, <2 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> ) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <2 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] ; %tmp3 = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> , <2 x i64> ) @@ -6206,7 +6326,8 @@ define <2 x i64> @neon.sshl2d_no_fold(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[TMP2]], <2 x i64> ) ; CHECK-NEXT: 
store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6331,7 +6452,8 @@ define <8 x i8> @sqshli8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6357,7 +6479,8 @@ define <4 x i16> @sqshli4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -6383,7 +6506,8 @@ define <2 x i32> @sqshli2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> 
@llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -6409,7 +6533,8 @@ define <16 x i8> @sqshli16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -6435,7 +6560,8 @@ define <8 x i16> @sqshli8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -6461,7 +6587,8 @@ define <4 x i32> @sqshli4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; 
CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6487,7 +6614,8 @@ define <2 x i64> @sqshli2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6513,7 +6641,8 @@ define <8 x i8> @uqshli8b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> 
[[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6539,7 +6668,8 @@ define <8 x i8> @uqshli8b_1(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: store <8 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i8> [[TMP3]] @@ -6565,7 +6695,8 @@ define <4 x i16> @uqshli4h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: store <4 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[TMP3]] @@ -6591,7 +6722,8 @@ define <2 x i32> @uqshli2s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> 
[[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: store <2 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i32> [[TMP3]] @@ -6617,7 +6749,8 @@ define <16 x i8> @uqshli16b(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i8> [[TMP3]] @@ -6643,7 +6776,8 @@ define <8 x i16> @uqshli8h(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: store <8 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i16> [[TMP3]] @@ -6669,7 +6803,8 @@ define <4 x i32> @uqshli4s(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 
[[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP3]] @@ -6695,7 +6830,8 @@ define <2 x i64> @uqshli2d(ptr %A) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 193514046488576 ; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[TMP3]] @@ -6722,14 +6858,15 @@ define <8 x i8> @ursra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6764,14 +6901,15 @@ define <4 x i16> @ursra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6806,14 +6944,15 @@ define <2 x i32> @ursra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8 -; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6848,14 +6987,15 @@ define <16 x i8> @ursra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr 
[[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6890,14 +7030,15 @@ define <8 x i16> @ursra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6932,14 +7073,15 @@ define <4 x i32> @ursra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; 
CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -6974,14 +7116,15 @@ define <2 x i64> @ursra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[TMP1]], <2 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7016,14 +7159,15 @@ define <1 x i64> @ursra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; 
CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7058,14 +7202,15 @@ define i64 @ursra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[B]], align 8 ; 
CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7100,14 +7245,15 @@ define <8 x i8> @srsra8b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i8>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[_MSLD]], <8 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> [[TMP1]], <8 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7142,14 +7288,15 @@ define <4 x i16> @srsra4h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[_MSLD]], <4 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[TMP1]], <4 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne 
i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7184,14 +7331,15 @@ define <2 x i32> @srsra2s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i32>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[_MSLD]], <2 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7226,14 +7374,15 @@ define <16 x i8> @srsra16b(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i8>, ptr 
[[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[_MSLD]], <16 x i8> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7268,14 +7417,15 @@ define <8 x i16> @srsra8h(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[_MSLD]], <8 x i16> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = 
load <8 x i16>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7310,14 +7460,15 @@ define <4 x i32> @srsra4s(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[_MSLD]], <4 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7352,14 +7503,15 @@ define <2 x i64> @srsra2d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSLD]], <2 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[TMP1]], 
<2 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[B]], align 16 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7394,14 +7546,15 @@ define <1 x i64> @srsra1d(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <1 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[_MSLD]], <1 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[TMP1]], <1 x i64> ) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -7436,14 +7589,15 @@ define i64 @srsra_scalar(ptr %A, ptr %B) nounwind sanitize_memory { ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr 
; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[_MSLD]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[_MSLD]], i64 -1) +; CHECK-NEXT: [[_MSPROP:%.*]] = or i64 [[TMP8]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 [[TMP1]], i64 -1) ; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] -; CHECK: 8: +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF0]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 9: +; CHECK: 11: ; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 193514046488576 @@ -9253,14 +9407,15 @@ define void @sqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: 
unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9285,14 +9440,15 @@ define void @uqshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9317,14 +9473,15 @@ define void @srshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9349,14 +9506,15 @@ define void @urshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sanit ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: 
[[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9381,14 +9539,15 @@ define void @sqshlu_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) sani ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9413,14 +9572,15 @@ define void @sshl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> 
zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9445,14 +9605,15 @@ define void @ushl_zero_shift_amount(<2 x i64> %a, <2 x i64> %b, ptr %dst) saniti ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <2 x i64> [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[VPADDQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[A]], <2 x i64> [[B]]) -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[_MSPROP]], <2 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <2 x i64> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[VSHLQ_V2_I_I:%.*]] = tail call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VPADDQ_V2_I_I]], <2 x i64> zeroinitializer) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF0]] -; CHECK: 3: +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF0]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable -; CHECK: 4: +; CHECK: 6: ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-NEXT: [[TMP6:%.*]] = xor 
i64 [[TMP5]], 193514046488576 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr @@ -9473,16 +9634,12 @@ define <4 x i32> @sext_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP0]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 13) +; CHECK-NEXT: [[_MSPROP:%.*]] = sext <4 x i16> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VMOVL_I:%.*]] = sext <4 x i16> [[VRSHRN_N1]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] ; entry: @@ -9497,16 +9654,12 @@ define <4 x i32> @zext_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[TMP0]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], 
zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[A]], i32 13) +; CHECK-NEXT: [[_MSPROP:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> ; CHECK-NEXT: [[VMOVL_I:%.*]] = zext <4 x i16> [[VRSHRN_N1]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[VMOVL_I]] ; entry: @@ -9523,15 +9676,10 @@ define <4 x i16> @mul_rshrn(<4 x i32> noundef %a) sanitize_memory { ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[B:%.*]] = add <4 x i32> [[A]], -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[_MSPROP]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF0]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] -; CHECK-NEXT: unreachable -; CHECK: 3: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[_MSPROP]], i32 13) +; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[VRSHRN_N1:%.*]] = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[B]], i32 13) -; CHECK-NEXT: store <4 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <4 x i16> [[TMP2]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i16> [[VRSHRN_N1]] ; entry: From 4a2bf05980fd59013e74603f961d4a52cf8db940 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Thu, 8 Aug 2024 16:33:03 -0700 Subject: [PATCH 091/112] Reapply "[ctx_prof] Fix the pre-thinlink "use" case (#102511)" This reverts commit 967185eeb85abb77bd6b6cdd2b026d5c54b7d4f3. The problem was link dependencies, moved `UseCtxProfile` to `Analysis`. 
--- .../Instrumentation/PGOCtxProfLowering.h | 3 ++- llvm/lib/Analysis/CtxProfAnalysis.cpp | 8 ++++++-- llvm/lib/Passes/PassBuilderPipelines.cpp | 17 ++++++++--------- .../Instrumentation/PGOCtxProfLowering.cpp | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 17 ++++++++++------- .../PGOProfile/ctx-prof-use-prelink.ll | 14 ++++++++++---- 6 files changed, 37 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h index 5256aff56205bab..f127d16b8de124d 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h +++ b/llvm/include/llvm/Transforms/Instrumentation/PGOCtxProfLowering.h @@ -19,7 +19,8 @@ class Type; class PGOCtxProfLoweringPass : public PassInfoMixin { public: explicit PGOCtxProfLoweringPass() = default; - static bool isContextualIRPGOEnabled(); + // True if contextual instrumentation is enabled. + static bool isCtxIRPGOInstrEnabled(); PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM); }; diff --git a/llvm/lib/Analysis/CtxProfAnalysis.cpp b/llvm/lib/Analysis/CtxProfAnalysis.cpp index f56f910bd778443..fbae705127538a9 100644 --- a/llvm/lib/Analysis/CtxProfAnalysis.cpp +++ b/llvm/lib/Analysis/CtxProfAnalysis.cpp @@ -17,11 +17,17 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PassManager.h" #include "llvm/ProfileData/PGOCtxProfReader.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" #define DEBUG_TYPE "ctx_prof" +using namespace llvm; +cl::opt + UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, + cl::desc("Use the specified contextual profile file")); + namespace llvm { namespace json { Value toJSON(const PGOCtxProfContext &P) { @@ -58,8 +64,6 @@ Value toJSON(const PGOCtxProfContext::CallTargetMapTy &P) { } // namespace json } // namespace llvm -using namespace llvm; - AnalysisKey CtxProfAnalysis::Key; CtxProfAnalysis::Result 
CtxProfAnalysis::run(Module &M, diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index c175ee898098495..6927a2886b962b5 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -304,9 +304,7 @@ static cl::opt UseLoopVersioningLICM( "enable-loop-versioning-licm", cl::init(false), cl::Hidden, cl::desc("Enable the experimental Loop Versioning LICM pass")); -cl::opt - UseCtxProfile("use-ctx-profile", cl::init(""), cl::Hidden, - cl::desc("Use the specified contextual profile file")); +extern cl::opt UseCtxProfile; namespace llvm { extern cl::opt EnableMemProfContextDisambiguation; @@ -1173,13 +1171,12 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, const bool IsMemprofUse = IsPGOPreLink && !PGOOpt->MemoryProfile.empty(); // We don't want to mix pgo ctx gen and pgo gen; we also don't currently // enable ctx profiling from the frontend. - assert( - !(IsPGOInstrGen && PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) && - "Enabling both instrumented FDO and contextual instrumentation is not " - "supported."); + assert(!(IsPGOInstrGen && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled()) && + "Enabling both instrumented PGO and contextual instrumentation is not " + "supported."); // Enable contextual profiling instrumentation. const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; @@ -1670,8 +1667,10 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let // thinlto use the contextual info to perform imports; then use the contextual // profile in the post-thinlink phase. 
- if (!UseCtxProfile.empty() && !PGOOpt) + if (!UseCtxProfile.empty() && !PGOOpt) { + addRequiredLTOPreLinkPasses(MPM); return MPM; + } // Run partial inlining pass to partially inline functions that have // large bodies. diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp index de1d4d2381c06ee..d6ba12465bb3283 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp @@ -30,7 +30,7 @@ static cl::list ContextRoots( "root of an interesting graph, which will be profiled independently " "from other similar graphs.")); -bool PGOCtxProfLoweringPass::isContextualIRPGOEnabled() { +bool PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() { return !ContextRoots.empty(); } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 1ce8f58c1aa1408..41618194d12ed7c 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -321,6 +321,7 @@ static cl::opt PGOFunctionCriticalEdgeThreshold( " greater than this threshold.")); extern cl::opt MaxNumVTableAnnotations; +extern cl::opt UseCtxProfile; namespace llvm { // Command line option to turn on CFG dot dump after profile annotation. @@ -338,9 +339,12 @@ extern cl::opt EnableVTableProfileUse; extern cl::opt ProfileCorrelate; } // namespace llvm +bool shouldInstrumentForCtxProf() { + return PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled() || + !UseCtxProfile.empty(); +} bool shouldInstrumentEntryBB() { - return PGOInstrumentEntry || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return PGOInstrumentEntry || shouldInstrumentForCtxProf(); } // FIXME(mtrofin): re-enable this for ctx profiling, for non-indirect calls. 
Ctx @@ -348,8 +352,7 @@ bool shouldInstrumentEntryBB() { // Supporting other values is relatively straight-forward - just another counter // range within the context. bool isValueProfilingDisabled() { - return DisableValueProfiling || - PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + return DisableValueProfiling || shouldInstrumentForCtxProf(); } // Return a string describing the branch condition that can be @@ -902,7 +905,7 @@ static void instrumentOneFunc( unsigned NumCounters = InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts(); - if (PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) { + if (shouldInstrumentForCtxProf()) { auto *CSIntrinsic = Intrinsic::getDeclaration(M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This @@ -1861,7 +1864,7 @@ static bool InstrumentAllFunctions( function_ref LookupBFI, bool IsCS) { // For the context-sensitve instrumentation, we should have a separated pass // (before LTO/ThinLTO linking) to create these variables. 
- if (!IsCS && !PGOCtxProfLoweringPass::isContextualIRPGOEnabled()) + if (!IsCS && !shouldInstrumentForCtxProf()) createIRLevelProfileFlagVar(M, /*IsCS=*/false); Triple TT(M.getTargetTriple()); @@ -2112,7 +2115,7 @@ static bool annotateAllFunctions( bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled(); if (PGOInstrumentEntry.getNumOccurrences() > 0) InstrumentFuncEntry = PGOInstrumentEntry; - InstrumentFuncEntry |= PGOCtxProfLoweringPass::isContextualIRPGOEnabled(); + InstrumentFuncEntry |= shouldInstrumentForCtxProf(); bool HasSingleByteCoverage = PGOReader->hasSingleByteCoverage(); for (auto &F : M) { diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index b50a815be5abf5b..18ac2f92aa39d46 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; There is no profile, but that's OK because the prelink does not care about ; the content of the profile, just that we intend to use one. ; There is no scenario currently of doing ctx profile use without thinlto. @@ -7,19 +7,22 @@ declare void @bar() +;. +; CHECK: @__profn_foo = private constant [3 x i8] c"foo" +;. 
define void @foo(i32 %a, ptr %fct) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) local_unnamed_addr { +; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[A]], 0 ; CHECK-NEXT: br i1 [[T]], label %[[YES:.*]], label %[[NO:.*]] ; CHECK: [[YES]]: ; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1) -; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[FCT]] to i64 -; CHECK-NEXT: call void @llvm.instrprof.value.profile(ptr @__profn_foo, i64 728453322856651412, i64 [[TMP1]], i32 0, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0, ptr [[FCT]]) ; CHECK-NEXT: call void [[FCT]](i32 0) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[NO]]: -; CHECK-NEXT: call void @llvm.instrprof.increment(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 0) +; CHECK-NEXT: call void @llvm.instrprof.callsite(ptr @__profn_foo, i64 728453322856651412, i32 2, i32 1, ptr @bar) ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label %[[EXIT]] ; CHECK: [[EXIT]]: @@ -36,3 +39,6 @@ no: exit: ret void } +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +;. 
From 4c1dbbe7aaeb1cb3f991e1de9c7d0dd312e565f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 8 Aug 2024 17:09:53 -0700 Subject: [PATCH 092/112] [flang][cuda] Make CUFRegisterAllocator callable from C/Fortran (#102543) --- flang/include/flang/Runtime/CUDA/allocator.h | 6 +++++- flang/runtime/CUDA/allocator.cpp | 4 +++- flang/unittests/Runtime/CUDA/AllocatorCUF.cpp | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/flang/include/flang/Runtime/CUDA/allocator.h b/flang/include/flang/Runtime/CUDA/allocator.h index 8f5204769d7aa72..f0bfc1548e64587 100644 --- a/flang/include/flang/Runtime/CUDA/allocator.h +++ b/flang/include/flang/Runtime/CUDA/allocator.h @@ -10,6 +10,7 @@ #define FORTRAN_RUNTIME_CUDA_ALLOCATOR_H_ #include "flang/Runtime/descriptor.h" +#include "flang/Runtime/entry-names.h" #define CUDA_REPORT_IF_ERROR(expr) \ [](CUresult result) { \ @@ -25,7 +26,10 @@ namespace Fortran::runtime::cuda { -void CUFRegisterAllocator(); +extern "C" { + +void RTDECL(CUFRegisterAllocator)(); +} void *CUFAllocPinned(std::size_t); void CUFFreePinned(void *); diff --git a/flang/runtime/CUDA/allocator.cpp b/flang/runtime/CUDA/allocator.cpp index cd00d40361d28b3..bd657b800c61e89 100644 --- a/flang/runtime/CUDA/allocator.cpp +++ b/flang/runtime/CUDA/allocator.cpp @@ -18,8 +18,9 @@ #include "cuda.h" namespace Fortran::runtime::cuda { +extern "C" { -void CUFRegisterAllocator() { +void RTDEF(CUFRegisterAllocator)() { allocatorRegistry.Register( kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned}); allocatorRegistry.Register( @@ -29,6 +30,7 @@ void CUFRegisterAllocator() { allocatorRegistry.Register( kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified}); } +} void *CUFAllocPinned(std::size_t sizeInBytes) { void *p; diff --git a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp 
b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp index 4f53e654034cb76..9f5ec289ee8f740 100644 --- a/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp +++ b/flang/unittests/Runtime/CUDA/AllocatorCUF.cpp @@ -55,7 +55,7 @@ class ScopedContext { TEST(AllocatableCUFTest, SimpleDeviceAllocate) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; @@ -73,7 +73,7 @@ TEST(AllocatableCUFTest, SimpleDeviceAllocate) { TEST(AllocatableCUFTest, SimplePinnedAllocate) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // INTEGER(4), PINNED, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Integer, 4)}; @@ -92,7 +92,7 @@ TEST(AllocatableCUFTest, SimplePinnedAllocate) { TEST(AllocatableCUFTest, DescriptorAllocationTest) { using Fortran::common::TypeCategory; - Fortran::runtime::cuda::CUFRegisterAllocator(); + RTNAME(CUFRegisterAllocator)(); ScopedContext ctx; // REAL(4), DEVICE, ALLOCATABLE :: a(:) auto a{createAllocatable(TypeCategory::Real, 4)}; From 201da87c3f53f9450dd60ee5adbddf46fe19c430 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Thu, 8 Aug 2024 17:29:09 -0700 Subject: [PATCH 093/112] [mlir][vector] Handle corner cases in DropUnitDimsFromTransposeOp. (#102518) https://github.com/llvm/llvm-project/commit/da8778e499d8049ac68c2e152941a38ff2bc9fb2 breaks the lowering of vector.transpose that all the dimensions are unit dimensions. The revision fixes the issue and adds a test. 
--------- Signed-off-by: hanhanW --- .../Dialect/Vector/Transforms/VectorTransforms.cpp | 7 +++++++ .../test/Dialect/Vector/vector-transfer-flatten.mlir | 12 ++++++++++++ 2 files changed, 19 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index bc0c96b32a80f5b..7f59a378e035124 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1771,6 +1771,13 @@ struct DropUnitDimsFromTransposeOp final newPerm.push_back(idx - droppedDimsBefore[idx]); } + // Fixup for `newPerm`. The `sourceTypeWithoutUnitDims` could be vector<1xT> + // type when the dimensions are unit dimensions. In this case, the newPerm + // should be [0]. + if (newPerm.empty()) { + newPerm.push_back(0); + } + Location loc = op.getLoc(); // Drop the unit dims via shape_cast. auto dropDimsShapeCast = rewriter.create( diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 937dbf22bb713f8..df0a5c5fa0ce869 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -737,6 +737,18 @@ func.func @transpose_with_scalable_unit_dims(%vec: vector<[1]x1x2x4x1xf32>) -> v // ----- +func.func @transpose_with_all_unit_dims(%vec: vector<1x1x1xf32>) -> vector<1x1x1xf32> { + %res = vector.transpose %vec, [0, 2, 1] : vector<1x1x1xf32> to vector<1x1x1xf32> + return %res : vector<1x1x1xf32> +} +// The `vec` is returned because there are other flattening patterns that fold +// vector.shape_cast ops away. 
+// CHECK-LABEL: func.func @transpose_with_all_unit_dims +// CHECK-SAME: %[[VEC:.[a-zA-Z0-9]+]] +// CHECK-NEXT: return %[[VEC]] + +// ----- + func.func @negative_transpose_with_no_unit_dims(%vec: vector<4x2x3xf32>) -> vector<4x3x2xf32> { %res = vector.transpose %vec, [0, 2, 1] : vector<4x2x3xf32> to vector<4x3x2xf32> return %res : vector<4x3x2xf32> From 0a4e1c518bbca5f3bced6ded6dd71d2fe6622ac3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 8 Aug 2024 17:33:54 -0700 Subject: [PATCH 094/112] [RISCV] Add some Zfinx instructions to hasAllNBitUsers. --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 6 ++++++ llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp | 6 ++++++ llvm/test/CodeGen/RISCV/double-convert-strict.ll | 8 +++----- llvm/test/CodeGen/RISCV/double-convert.ll | 8 +++----- llvm/test/CodeGen/RISCV/float-convert-strict.ll | 8 +++----- llvm/test/CodeGen/RISCV/float-convert.ll | 8 +++----- llvm/test/CodeGen/RISCV/half-convert-strict.ll | 16 ++++++---------- llvm/test/CodeGen/RISCV/half-convert.ll | 16 ++++++---------- 8 files changed, 36 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 604234b243153c3..ce3a37e194d545a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3194,11 +3194,17 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits, case RISCV::SLLI_UW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: + case RISCV::FCVT_H_W_INX: case RISCV::FCVT_H_WU: + case RISCV::FCVT_H_WU_INX: case RISCV::FCVT_S_W: + case RISCV::FCVT_S_W_INX: case RISCV::FCVT_S_WU: + case RISCV::FCVT_S_WU_INX: case RISCV::FCVT_D_W: + case RISCV::FCVT_D_W_INX: case RISCV::FCVT_D_WU: + case RISCV::FCVT_D_WU_INX: case RISCV::TH_REVW: case RISCV::TH_SRRIW: if (Bits >= 32) diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp index 00ac8cb9a9066d7..effec2cc776d809 100644 --- 
a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp +++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp @@ -174,11 +174,17 @@ static bool hasAllNBitUsers(const MachineInstr &OrigMI, case RISCV::SLLI_UW: case RISCV::FMV_W_X: case RISCV::FCVT_H_W: + case RISCV::FCVT_H_W_INX: case RISCV::FCVT_H_WU: + case RISCV::FCVT_H_WU_INX: case RISCV::FCVT_S_W: + case RISCV::FCVT_S_W_INX: case RISCV::FCVT_S_WU: + case RISCV::FCVT_S_WU_INX: case RISCV::FCVT_D_W: + case RISCV::FCVT_D_W_INX: case RISCV::FCVT_D_WU: + case RISCV::FCVT_D_WU_INX: if (Bits >= 32) break; return false; diff --git a/llvm/test/CodeGen/RISCV/double-convert-strict.ll b/llvm/test/CodeGen/RISCV/double-convert-strict.ll index 13bcafb5ebd1361..3732978b8bd83ea 100644 --- a/llvm/test/CodeGen/RISCV/double-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/double-convert-strict.ll @@ -777,11 +777,9 @@ define signext i32 @fcvt_d_w_demanded_bits(i32 signext %0, ptr %1) nounwind stri ; ; RV64IZFINXZDINX-LABEL: fcvt_d_w_demanded_bits: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: addiw a2, a0, 1 -; RV64IZFINXZDINX-NEXT: addi a0, a0, 1 -; RV64IZFINXZDINX-NEXT: fcvt.d.w a0, a0 -; RV64IZFINXZDINX-NEXT: sd a0, 0(a1) -; RV64IZFINXZDINX-NEXT: mv a0, a2 +; RV64IZFINXZDINX-NEXT: addiw a0, a0, 1 +; RV64IZFINXZDINX-NEXT: fcvt.d.w a2, a0 +; RV64IZFINXZDINX-NEXT: sd a2, 0(a1) ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_d_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll index feea4f19720b0ba..2e2e1b924cf0096 100644 --- a/llvm/test/CodeGen/RISCV/double-convert.ll +++ b/llvm/test/CodeGen/RISCV/double-convert.ll @@ -1459,11 +1459,9 @@ define signext i32 @fcvt_d_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZFINXZDINX-LABEL: fcvt_d_w_demanded_bits: ; RV64IZFINXZDINX: # %bb.0: -; RV64IZFINXZDINX-NEXT: addiw a2, a0, 1 -; RV64IZFINXZDINX-NEXT: addi a0, a0, 1 -; RV64IZFINXZDINX-NEXT: fcvt.d.w a0, a0 -; RV64IZFINXZDINX-NEXT: sd a0, 0(a1) -; RV64IZFINXZDINX-NEXT: mv 
a0, a2 +; RV64IZFINXZDINX-NEXT: addiw a0, a0, 1 +; RV64IZFINXZDINX-NEXT: fcvt.d.w a2, a0 +; RV64IZFINXZDINX-NEXT: sd a2, 0(a1) ; RV64IZFINXZDINX-NEXT: ret ; ; RV32I-LABEL: fcvt_d_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/float-convert-strict.ll b/llvm/test/CodeGen/RISCV/float-convert-strict.ll index 402d6f0362e6d3d..0c265e11652a2ec 100644 --- a/llvm/test/CodeGen/RISCV/float-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/float-convert-strict.ll @@ -645,11 +645,9 @@ define signext i32 @fcvt_s_w_demanded_bits(i32 signext %0, ptr %1) nounwind stri ; ; RV64IZFINX-LABEL: fcvt_s_w_demanded_bits: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: addiw a2, a0, 1 -; RV64IZFINX-NEXT: addi a0, a0, 1 -; RV64IZFINX-NEXT: fcvt.s.w a0, a0 -; RV64IZFINX-NEXT: sw a0, 0(a1) -; RV64IZFINX-NEXT: mv a0, a2 +; RV64IZFINX-NEXT: addiw a0, a0, 1 +; RV64IZFINX-NEXT: fcvt.s.w a2, a0 +; RV64IZFINX-NEXT: sw a2, 0(a1) ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_s_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/float-convert.ll b/llvm/test/CodeGen/RISCV/float-convert.ll index 7eabd3f5f2273af..21bf6618c52a26b 100644 --- a/llvm/test/CodeGen/RISCV/float-convert.ll +++ b/llvm/test/CodeGen/RISCV/float-convert.ll @@ -1247,11 +1247,9 @@ define signext i32 @fcvt_s_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZFINX-LABEL: fcvt_s_w_demanded_bits: ; RV64IZFINX: # %bb.0: -; RV64IZFINX-NEXT: addiw a2, a0, 1 -; RV64IZFINX-NEXT: addi a0, a0, 1 -; RV64IZFINX-NEXT: fcvt.s.w a0, a0 -; RV64IZFINX-NEXT: sw a0, 0(a1) -; RV64IZFINX-NEXT: mv a0, a2 +; RV64IZFINX-NEXT: addiw a0, a0, 1 +; RV64IZFINX-NEXT: fcvt.s.w a2, a0 +; RV64IZFINX-NEXT: sw a2, 0(a1) ; RV64IZFINX-NEXT: ret ; ; RV32I-LABEL: fcvt_s_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/half-convert-strict.ll b/llvm/test/CodeGen/RISCV/half-convert-strict.ll index 677aa9263ea6155..8f88a4c570ea054 100644 --- a/llvm/test/CodeGen/RISCV/half-convert-strict.ll +++ b/llvm/test/CodeGen/RISCV/half-convert-strict.ll @@ -1963,11 +1963,9 
@@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) strictfp { ; ; RV64IZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: addiw a2, a0, 1 -; RV64IZHINX-NEXT: addi a0, a0, 1 -; RV64IZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZHINX-NEXT: sh a0, 0(a1) -; RV64IZHINX-NEXT: mv a0, a2 +; RV64IZHINX-NEXT: addiw a0, a0, 1 +; RV64IZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZHINX-NEXT: sh a2, 0(a1) ; RV64IZHINX-NEXT: ret ; ; RV32IDZFH-LABEL: fcvt_h_w_demanded_bits: @@ -1993,11 +1991,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) strictfp { ; ; RV64IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZDINXZHINX: # %bb.0: -; RV64IZDINXZHINX-NEXT: addiw a2, a0, 1 -; RV64IZDINXZHINX-NEXT: addi a0, a0, 1 -; RV64IZDINXZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZDINXZHINX-NEXT: sh a0, 0(a1) -; RV64IZDINXZHINX-NEXT: mv a0, a2 +; RV64IZDINXZHINX-NEXT: addiw a0, a0, 1 +; RV64IZDINXZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZDINXZHINX-NEXT: sh a2, 0(a1) ; RV64IZDINXZHINX-NEXT: ret ; ; CHECK32-IZFHMIN-LABEL: fcvt_h_w_demanded_bits: diff --git a/llvm/test/CodeGen/RISCV/half-convert.ll b/llvm/test/CodeGen/RISCV/half-convert.ll index 31fb6e2ee9c8409..48bfe1c37c625c6 100644 --- a/llvm/test/CodeGen/RISCV/half-convert.ll +++ b/llvm/test/CodeGen/RISCV/half-convert.ll @@ -5760,11 +5760,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZHINX: # %bb.0: -; RV64IZHINX-NEXT: addiw a2, a0, 1 -; RV64IZHINX-NEXT: addi a0, a0, 1 -; RV64IZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZHINX-NEXT: sh a0, 0(a1) -; RV64IZHINX-NEXT: mv a0, a2 +; RV64IZHINX-NEXT: addiw a0, a0, 1 +; RV64IZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZHINX-NEXT: sh a2, 0(a1) ; RV64IZHINX-NEXT: ret ; ; RV32IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: @@ -5776,11 +5774,9 @@ define signext i32 @fcvt_h_w_demanded_bits(i32 signext %0, ptr %1) nounwind { ; ; RV64IZDINXZHINX-LABEL: fcvt_h_w_demanded_bits: ; RV64IZDINXZHINX: # %bb.0: 
-; RV64IZDINXZHINX-NEXT: addiw a2, a0, 1 -; RV64IZDINXZHINX-NEXT: addi a0, a0, 1 -; RV64IZDINXZHINX-NEXT: fcvt.h.w a0, a0 -; RV64IZDINXZHINX-NEXT: sh a0, 0(a1) -; RV64IZDINXZHINX-NEXT: mv a0, a2 +; RV64IZDINXZHINX-NEXT: addiw a0, a0, 1 +; RV64IZDINXZHINX-NEXT: fcvt.h.w a2, a0 +; RV64IZDINXZHINX-NEXT: sh a2, 0(a1) ; RV64IZDINXZHINX-NEXT: ret ; ; RV32I-LABEL: fcvt_h_w_demanded_bits: From 9788368c37c319b11eb9a31af0f10aac62ba4f72 Mon Sep 17 00:00:00 2001 From: Job Henandez Lara Date: Thu, 8 Aug 2024 17:58:59 -0700 Subject: [PATCH 095/112] [libc][math][c23] Fix setpayloadsig smoke test on RV32 (#102538) --- libc/test/src/math/smoke/SetPayloadSigTest.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/libc/test/src/math/smoke/SetPayloadSigTest.h b/libc/test/src/math/smoke/SetPayloadSigTest.h index 7ec3ac08a180abe..60913c60b481c66 100644 --- a/libc/test/src/math/smoke/SetPayloadSigTest.h +++ b/libc/test/src/math/smoke/SetPayloadSigTest.h @@ -35,7 +35,13 @@ class SetPayloadSigTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(1, func(&res, T(-1.0))); EXPECT_EQ(1, func(&res, T(0x42.1p+0))); EXPECT_EQ(1, func(&res, T(-0x42.1p+0))); - EXPECT_EQ(1, func(&res, T(StorageType(1) << (FPBits::FRACTION_LEN - 1)))); + + FPBits default_snan_payload_bits = FPBits::one(); + default_snan_payload_bits.set_biased_exponent(FPBits::FRACTION_LEN - 1 + + FPBits::EXP_BIAS); + T default_snan_payload = default_snan_payload_bits.get_val(); + + EXPECT_EQ(1, func(&res, default_snan_payload)); } void testValidPayloads(SetPayloadSigFunc func) { @@ -56,7 +62,12 @@ class SetPayloadSigTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest { EXPECT_EQ(FPBits::signaling_nan(Sign::POS, 0x123).uintval(), FPBits(res).uintval()); - EXPECT_EQ(0, func(&res, T(FPBits::FRACTION_MASK >> 1))); + FPBits nan_payload_bits = FPBits::one(); + nan_payload_bits.set_biased_exponent(FPBits::FRACTION_LEN - 2 + + FPBits::EXP_BIAS); + 
nan_payload_bits.set_mantissa(FPBits::SIG_MASK - 3); + T nan_payload = nan_payload_bits.get_val(); + EXPECT_EQ(0, func(&res, nan_payload)); EXPECT_TRUE(FPBits(res).is_signaling_nan()); EXPECT_EQ( FPBits::signaling_nan(Sign::POS, FPBits::FRACTION_MASK >> 1).uintval(), From c1912b4dd772b79814a17b03ad183959f73034cc Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 9 Aug 2024 03:05:41 +0200 Subject: [PATCH 096/112] [BOLT][docs] Fix typo (#98640) Typo: `chwon` --> `chown` Signed-off-by: Peter Jung --- bolt/docs/OptimizingLinux.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/docs/OptimizingLinux.md b/bolt/docs/OptimizingLinux.md index 0045f0ead9fd0d0..c85fecabcccc292 100644 --- a/bolt/docs/OptimizingLinux.md +++ b/bolt/docs/OptimizingLinux.md @@ -37,7 +37,7 @@ Convert `perf` profile into a format suitable for BOLT passing the `vmlinux` bin ```bash -$ sudo chwon $USER perf.data +$ sudo chown $USER perf.data $ perf2bolt -p perf.data -o perf.fdata vmlinux ``` From 2ac2e9a5b6c97cbf267db1ef322ed21ebceb2aba Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Thu, 8 Aug 2024 19:27:54 -0700 Subject: [PATCH 097/112] [mlir][LLVM] Improve lowering of `llvm.byval` function arguments (#100028) When a function argument is annotated with the `llvm.byval` attribute, [LLVM expects](https://llvm.org/docs/LangRef.html#parameter-attributes) the function argument type to be an `llvm.ptr`. For example: ``` func.func (%args0 : llvm.ptr {llvm.byval = !llvm.struct<(i32)>} { ... } ``` Unfortunately, this makes the type conversion context-dependent, which is something that the type conversion infrastructure (i.e., `LLVMTypeConverter` in this particular case) doesn't support. For example, we may want to convert `MyType` to `llvm.struct<(i32)>` in general, but to an `llvm.ptr` type only when it's a function argument passed by value. 
To fix this problem, this PR changes the FuncToLLVM conversion logic to generate an `llvm.ptr` when the function argument has a `llvm.byval` attribute. An `llvm.load` is inserted into the function to retrieve the value expected by the argument users. --- .../Conversion/LLVMCommon/TypeConverter.h | 33 ++++++++- mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp | 46 +++++++++++- .../Conversion/LLVMCommon/TypeConverter.cpp | 71 +++++++++++++++++-- .../test/Transforms/test-convert-func-op.mlir | 30 +++++++- .../FuncToLLVM/TestConvertFuncOp.cpp | 16 ++++- 5 files changed, 184 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index e228229302cff46..d79b90f840ce836 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -21,6 +21,7 @@ namespace mlir { class DataLayoutAnalysis; +class FunctionOpInterface; class LowerToLLVMOptions; namespace LLVM { @@ -50,13 +51,25 @@ class LLVMTypeConverter : public TypeConverter { LLVMTypeConverter(MLIRContext *ctx, const LowerToLLVMOptions &options, const DataLayoutAnalysis *analysis = nullptr); - /// Convert a function type. The arguments and results are converted one by + /// Convert a function type. The arguments and results are converted one by /// one and results are packed into a wrapped LLVM IR structure type. `result` /// is populated with argument mapping. Type convertFunctionSignature(FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, SignatureConversion &result) const; + /// Convert a function type. The arguments and results are converted one by + /// one and results are packed into a wrapped LLVM IR structure type. `result` + /// is populated with argument mapping. Converted types of `llvm.byval` and + /// `llvm.byref` function arguments which are not LLVM pointers are overridden + /// with LLVM pointers. 
Overridden arguments are returned in + /// `byValRefNonPtrAttrs`. + Type convertFunctionSignature(FunctionOpInterface funcOp, bool isVariadic, + bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> + &byValRefNonPtrAttrs) const; + /// Convert a non-empty list of types to be returned from a function into an /// LLVM-compatible type. In particular, if more than one value is returned, /// create an LLVM dialect structure type with elements that correspond to @@ -159,12 +172,26 @@ class LLVMTypeConverter : public TypeConverter { SmallVector &getCurrentThreadRecursiveStack(); private: - /// Convert a function type. The arguments and results are converted one by - /// one. Additionally, if the function returns more than one value, pack the + /// Convert a function type. The arguments and results are converted one by + /// one. Additionally, if the function returns more than one value, pack the /// results into an LLVM IR structure type so that the converted function type /// returns at most one result. Type convertFunctionType(FunctionType type) const; + /// Common implementation for `convertFunctionSignature` methods. Convert a + /// function type. The arguments and results are converted one by one and + /// results are packed into a wrapped LLVM IR structure type. `result` is + /// populated with argument mapping. If `byValRefNonPtrAttrs` is provided, + /// converted types of `llvm.byval` and `llvm.byref` function arguments which + /// are not LLVM pointers are overridden with LLVM pointers. `llvm.byval` and + /// `llvm.byref` arguments that were already converted to LLVM pointer types + /// are removed from 'byValRefNonPtrAttrs`. + Type convertFunctionSignatureImpl( + FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> *byValRefNonPtrAttrs) + const; + /// Convert the index type. 
Uses llvmModule data layout to create an integer /// of the pointer bitwidth. Type convertIndexType(IndexType type) const; diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp index c1f6d8bc5b361df..4c2e8682285c52f 100644 --- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp +++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp @@ -267,6 +267,38 @@ static void wrapExternalFunction(OpBuilder &builder, Location loc, } } +/// Inserts `llvm.load` ops in the function body to restore the expected pointee +/// value from `llvm.byval`/`llvm.byref` function arguments that were converted +/// to LLVM pointer types. +static void restoreByValRefArgumentType( + ConversionPatternRewriter &rewriter, const LLVMTypeConverter &typeConverter, + ArrayRef> byValRefNonPtrAttrs, + LLVM::LLVMFuncOp funcOp) { + // Nothing to do for function declarations. + if (funcOp.isExternal()) + return; + + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&funcOp.getFunctionBody().front()); + + for (const auto &[arg, byValRefAttr] : + llvm::zip(funcOp.getArguments(), byValRefNonPtrAttrs)) { + // Skip argument if no `llvm.byval` or `llvm.byref` attribute. + if (!byValRefAttr) + continue; + + // Insert load to retrieve the actual argument passed by value/reference. + assert(isa(arg.getType()) && + "Expected LLVM pointer type for argument with " + "`llvm.byval`/`llvm.byref` attribute"); + Type resTy = typeConverter.convertType( + cast(byValRefAttr->getValue()).getValue()); + + auto valueArg = rewriter.create(arg.getLoc(), resTy, arg); + rewriter.replaceAllUsesExcept(arg, valueArg, valueArg); + } +} + FailureOr mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, ConversionPatternRewriter &rewriter, @@ -280,10 +312,14 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, // Convert the original function arguments. 
They are converted using the // LLVMTypeConverter provided to this legalization pattern. auto varargsAttr = funcOp->getAttrOfType(varargsAttrName); + // Gather `llvm.byval` and `llvm.byref` arguments whose type convertion was + // overriden with an LLVM pointer type for later processing. + SmallVector> byValRefNonPtrAttrs; TypeConverter::SignatureConversion result(funcOp.getNumArguments()); auto llvmType = converter.convertFunctionSignature( - funcTy, varargsAttr && varargsAttr.getValue(), - shouldUseBarePtrCallConv(funcOp, &converter), result); + funcOp, varargsAttr && varargsAttr.getValue(), + shouldUseBarePtrCallConv(funcOp, &converter), result, + byValRefNonPtrAttrs); if (!llvmType) return rewriter.notifyMatchFailure(funcOp, "signature conversion failed"); @@ -398,6 +434,12 @@ mlir::convertFuncOpToLLVMFuncOp(FunctionOpInterface funcOp, "region types conversion failed"); } + // Fix the type mismatch between the materialized `llvm.ptr` and the expected + // pointee type in the function body when converting `llvm.byval`/`llvm.byref` + // function arguments. + restoreByValRefArgumentType(rewriter, converter, byValRefNonPtrAttrs, + newFuncOp); + if (!shouldUseBarePtrCallConv(funcOp, &converter)) { if (funcOp->getAttrOfType( LLVM::LLVMDialect::getEmitCWrapperAttrName())) { diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 17be4d91ee05465..5313a64ed47e3ae 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -270,13 +270,42 @@ Type LLVMTypeConverter::convertFunctionType(FunctionType type) const { return LLVM::LLVMPointerType::get(type.getContext()); } +/// Returns the `llvm.byval` or `llvm.byref` attributes that are present in the +/// function arguments. Returns an empty container if none of these attributes +/// are found in any of the arguments. 
+static void +filterByValRefArgAttrs(FunctionOpInterface funcOp, + SmallVectorImpl> &result) { + assert(result.empty() && "Unexpected non-empty output"); + result.resize(funcOp.getNumArguments(), std::nullopt); + bool foundByValByRefAttrs = false; + for (int argIdx : llvm::seq(funcOp.getNumArguments())) { + for (NamedAttribute namedAttr : funcOp.getArgAttrs(argIdx)) { + if ((namedAttr.getName() == LLVM::LLVMDialect::getByValAttrName() || + namedAttr.getName() == LLVM::LLVMDialect::getByRefAttrName())) { + foundByValByRefAttrs = true; + result[argIdx] = namedAttr; + break; + } + } + } + + if (!foundByValByRefAttrs) + result.clear(); +} + // Function types are converted to LLVM Function types by recursively converting -// argument and result types. If MLIR Function has zero results, the LLVM -// Function has one VoidType result. If MLIR Function has more than one result, +// argument and result types. If MLIR Function has zero results, the LLVM +// Function has one VoidType result. If MLIR Function has more than one result, // they are into an LLVM StructType in their order of appearance. -Type LLVMTypeConverter::convertFunctionSignature( +// If `byValRefNonPtrAttrs` is provided, converted types of `llvm.byval` and +// `llvm.byref` function arguments which are not LLVM pointers are overridden +// with LLVM pointers. `llvm.byval` and `llvm.byref` arguments that were already +// converted to LLVM pointer types are removed from 'byValRefNonPtrAttrs`. +Type LLVMTypeConverter::convertFunctionSignatureImpl( FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, - LLVMTypeConverter::SignatureConversion &result) const { + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> *byValRefNonPtrAttrs) const { // Select the argument converter depending on the calling convention. useBarePtrCallConv = useBarePtrCallConv || options.useBarePtrCallConv; auto funcArgConverter = useBarePtrCallConv ? 
barePtrFuncArgTypeConverter @@ -286,6 +315,19 @@ Type LLVMTypeConverter::convertFunctionSignature( SmallVector converted; if (failed(funcArgConverter(*this, type, converted))) return {}; + + // Rewrite converted type of `llvm.byval` or `llvm.byref` function + // argument that was not converted to an LLVM pointer types. + if (byValRefNonPtrAttrs != nullptr && !byValRefNonPtrAttrs->empty() && + converted.size() == 1 && (*byValRefNonPtrAttrs)[idx].has_value()) { + // If the argument was already converted to an LLVM pointer type, we stop + // tracking it as it doesn't need more processing. + if (isa(converted[0])) + (*byValRefNonPtrAttrs)[idx] = std::nullopt; + else + converted[0] = LLVM::LLVMPointerType::get(&getContext()); + } + result.addInputs(idx, converted); } @@ -302,6 +344,27 @@ Type LLVMTypeConverter::convertFunctionSignature( isVariadic); } +Type LLVMTypeConverter::convertFunctionSignature( + FunctionType funcTy, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result) const { + return convertFunctionSignatureImpl(funcTy, isVariadic, useBarePtrCallConv, + result, + /*byValRefNonPtrAttrs=*/nullptr); +} + +Type LLVMTypeConverter::convertFunctionSignature( + FunctionOpInterface funcOp, bool isVariadic, bool useBarePtrCallConv, + LLVMTypeConverter::SignatureConversion &result, + SmallVectorImpl> &byValRefNonPtrAttrs) const { + // Gather all `llvm.byval` and `llvm.byref` function arguments. Only those + // that were not converted to LLVM pointer types will be returned for further + // processing. + filterByValRefArgAttrs(funcOp, byValRefNonPtrAttrs); + auto funcTy = cast(funcOp.getFunctionType()); + return convertFunctionSignatureImpl(funcTy, isVariadic, useBarePtrCallConv, + result, &byValRefNonPtrAttrs); +} + /// Converts the function type to a C-compatible format, in particular using /// pointers to memref descriptors for arguments. 
std::pair diff --git a/mlir/test/Transforms/test-convert-func-op.mlir b/mlir/test/Transforms/test-convert-func-op.mlir index 6e96703cda57891..180f16a32991b32 100644 --- a/mlir/test/Transforms/test-convert-func-op.mlir +++ b/mlir/test/Transforms/test-convert-func-op.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-convert-func-op | FileCheck %s +// RUN: mlir-opt %s -test-convert-func-op --split-input-file | FileCheck %s // CHECK-LABEL: llvm.func @add func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface } { @@ -10,3 +10,31 @@ func.func @add(%arg0: i32, %arg1: i32) -> i32 attributes { llvm.emit_c_interface // CHECK-SAME: [[ARG1:%[a-zA-Z0-9_]+]]: i32 // CHECK-NEXT: [[RES:%.*]] = llvm.call @add([[ARG0]], [[ARG1]]) // CHECK-NEXT: llvm.return [[RES]] + +// ----- + +// Test that `llvm.byval` arguments are converted to `llvm.ptr` and the actual +// value is retrieved within the `llvm.func`. + +// CHECK-LABEL: llvm.func @byval +func.func @byval(%arg0: !test.smpla {llvm.byval = !test.smpla}) -> !test.smpla { + return %arg0 : !test.smpla +} + +// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr {llvm.byval = !llvm.struct<(i8, i8)>}) -> !llvm.struct<(i8, i8)> +// CHECK: %[[LD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> !llvm.struct<(i8, i8)> +// CHECK: llvm.return %[[LD]] : !llvm.struct<(i8, i8)> + +// ----- + +// Test that `llvm.byref` arguments are converted to `llvm.ptr` and the actual +// value is retrieved within the `llvm.func`. 
+ +// CHECK-LABEL: llvm.func @byref +func.func @byref(%arg0: !test.smpla {llvm.byref = !test.smpla}) -> !test.smpla { + return %arg0 : !test.smpla +} + +// CHECK-SAME: (%[[ARG0:.*]]: !llvm.ptr {llvm.byref = !llvm.struct<(i8, i8)>}) -> !llvm.struct<(i8, i8)> +// CHECK: %[[LD:.*]] = llvm.load %[[ARG0]] : !llvm.ptr -> !llvm.struct<(i8, i8)> +// CHECK: llvm.return %[[LD]] : !llvm.struct<(i8, i8)> diff --git a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp index e25e890e2290a48..75168dde93130f6 100644 --- a/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp +++ b/mlir/test/lib/Conversion/FuncToLLVM/TestConvertFuncOp.cpp @@ -47,12 +47,23 @@ struct ReturnOpConversion : public ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(func::ReturnOp returnOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(returnOp, - returnOp->getOperands()); + SmallVector resTys; + if (failed(typeConverter->convertTypes(returnOp->getResultTypes(), resTys))) + return failure(); + + rewriter.replaceOpWithNewOp(returnOp, resTys, + adaptor.getOperands()); return success(); } }; +static std::optional +convertSimpleATypeToStruct(test::SimpleAType simpleTy) { + MLIRContext *ctx = simpleTy.getContext(); + SmallVector memberTys(2, IntegerType::get(ctx, /*width=*/8)); + return LLVM::LLVMStructType::getLiteral(ctx, memberTys); +} + struct TestConvertFuncOp : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestConvertFuncOp) @@ -74,6 +85,7 @@ struct TestConvertFuncOp LowerToLLVMOptions options(ctx); // Populate type conversions. 
LLVMTypeConverter typeConverter(ctx, options); + typeConverter.addConversion(convertSimpleATypeToStruct); RewritePatternSet patterns(ctx); patterns.add(typeConverter); From 00139ae1bc0ae855ebe9c8991f480b382bbc4308 Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 8 Aug 2024 23:38:08 -0300 Subject: [PATCH 098/112] Revert "[clang] Reland: Instantiate concepts with sugared template arguments (#101782)" (#102551) --- clang/docs/ReleaseNotes.rst | 5 ++--- clang/include/clang/Sema/Template.h | 6 +++--- clang/lib/Sema/SemaConcept.cpp | 11 ++++------- clang/lib/Sema/SemaExprCXX.cpp | 5 +++-- clang/lib/Sema/SemaTemplate.cpp | 10 +++++----- clang/lib/Sema/SemaTemplateDeduction.cpp | 16 ++++++++-------- clang/lib/Serialization/ASTReaderDecl.cpp | 2 +- clang/test/AST/ast-dump-concepts.cpp | 10 ++++------ .../expr.prim.req/compound-requirement.cpp | 10 +++++----- .../expr.prim.req/nested-requirement.cpp | 2 +- .../expr.prim.req/simple-requirement.cpp | 4 ++-- .../expr.prim/expr.prim.req/type-requirement.cpp | 12 ++++++------ .../temp/temp.constr/temp.constr.normal/p1.cpp | 2 +- clang/test/CXX/temp/temp.param/p10-2a.cpp | 4 ++-- .../SemaTemplate/concepts-recursive-inst.cpp | 12 ++++++------ clang/test/SemaTemplate/concepts.cpp | 4 ++-- .../SemaTemplate/instantiate-requires-expr.cpp | 16 +++------------- clang/test/SemaTemplate/pr52970.cpp | 2 +- 18 files changed, 59 insertions(+), 74 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cb0a1b25e51c2a5..7beef7be0e6a533 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -75,8 +75,8 @@ sections with improvements to Clang's support for those languages. C++ Language Changes -------------------- -- Allow single element access of GCC vector/ext_vector_type object to be - constant expression. Supports the `V.xyzw` syntax and other tidbits +- Allow single element access of GCC vector/ext_vector_type object to be + constant expression. 
Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. C++17 Feature Support @@ -166,7 +166,6 @@ Improvements to Clang's diagnostics - Clang now diagnoses undefined behavior in constant expressions more consistently. This includes invalid shifts, and signed overflow in arithmetic. - -Wdangling-assignment-gsl is enabled by default. -- Clang now does a better job preserving the template arguments as written when specializing concepts. - Clang now always preserves the template arguments as written used to specialize template type aliases. diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h index d616865afe807e6..0340c23fd170d62 100644 --- a/clang/include/clang/Sema/Template.h +++ b/clang/include/clang/Sema/Template.h @@ -234,8 +234,7 @@ enum class TemplateSubstitutionKind : char { /// Replaces the current 'innermost' level with the provided argument list. /// This is useful for type deduction cases where we need to get the entire /// list from the AST, but then add the deduced innermost list. 
- void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args, - bool Final = false) { + void replaceInnermostTemplateArguments(Decl *AssociatedDecl, ArgList Args) { assert((!TemplateArgumentLists.empty() || NumRetainedOuterLevels) && "Replacing in an empty list?"); @@ -247,7 +246,8 @@ enum class TemplateSubstitutionKind : char { TemplateArgumentLists[0].Args = Args; } else { --NumRetainedOuterLevels; - TemplateArgumentLists.push_back({{AssociatedDecl, Final}, Args}); + TemplateArgumentLists.push_back( + {{AssociatedDecl, /*Final=*/false}, Args}); } } diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index d4c9d044985e34d..d1e62fb5cee6233 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -414,8 +414,7 @@ DiagRecursiveConstraintEval(Sema &S, llvm::FoldingSetNodeID &ID, E->Profile(ID, S.Context, /*Canonical=*/true); for (const auto &List : MLTAL) for (const auto &TemplateArg : List.Args) - S.Context.getCanonicalTemplateArgument(TemplateArg) - .Profile(ID, S.Context); + TemplateArg.Profile(ID, S.Context); // Note that we have to do this with our own collection, because there are // times where a constraint-expression check can cause us to need to evaluate @@ -643,8 +642,8 @@ bool Sema::CheckConstraintSatisfaction( // here. 
llvm::SmallVector FlattenedArgs; for (auto List : TemplateArgsLists) - for (const TemplateArgument &Arg : List.Args) - FlattenedArgs.emplace_back(Context.getCanonicalTemplateArgument(Arg)); + FlattenedArgs.insert(FlattenedArgs.end(), List.Args.begin(), + List.Args.end()); llvm::FoldingSetNodeID ID; ConstraintSatisfaction::Profile(ID, Context, Template, FlattenedArgs); @@ -828,8 +827,6 @@ Sema::SetupConstraintCheckingTemplateArgumentsAndScope( /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); - if (TemplateArgs) - MLTAL.replaceInnermostTemplateArguments(FD, *TemplateArgs, /*Final=*/true); if (SetupConstraintScope(FD, TemplateArgs, MLTAL, Scope)) return std::nullopt; @@ -1483,7 +1480,7 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, const ConceptSpecializationExpr *CSE) { MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( CSE->getNamedConcept(), CSE->getNamedConcept()->getLexicalDeclContext(), - /*Final=*/true, CSE->getTemplateArguments(), + /*Final=*/false, CSE->getTemplateArguments(), /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConstraintInstantiation=*/true); diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 1b56b4cabd133ef..124435330ca104f 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -9330,13 +9330,14 @@ Sema::BuildExprRequirement( // be satisfied. 
TemplateParameterList *TPL = ReturnTypeRequirement.getTypeConstraintTemplateParameterList(); - QualType MatchedType = Context.getReferenceQualifiedType(E); + QualType MatchedType = + Context.getReferenceQualifiedType(E).getCanonicalType(); llvm::SmallVector Args; Args.push_back(TemplateArgument(MatchedType)); auto *Param = cast(TPL->getParam(0)); - MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/true); + MultiLevelTemplateArgumentList MLTAL(Param, Args, /*Final=*/false); MLTAL.addOuterRetainedLevels(TPL->getDepth()); const TypeConstraint *TC = Param->getTypeConstraint(); assert(TC && "Type Constraint cannot be null here"); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index cd3ee31fcca610b..29e7978ba5b1f86 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4358,13 +4358,13 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS, auto *CSD = ImplicitConceptSpecializationDecl::Create( Context, NamedConcept->getDeclContext(), NamedConcept->getLocation(), - SugaredConverted); + CanonicalConverted); ConstraintSatisfaction Satisfaction; bool AreArgsDependent = TemplateSpecializationType::anyDependentTemplateArguments( - *TemplateArgs, SugaredConverted); - MultiLevelTemplateArgumentList MLTAL(NamedConcept, SugaredConverted, - /*Final=*/true); + *TemplateArgs, CanonicalConverted); + MultiLevelTemplateArgumentList MLTAL(NamedConcept, CanonicalConverted, + /*Final=*/false); LocalInstantiationScope Scope(*this); EnterExpressionEvaluationContext EECtx{ @@ -5583,7 +5583,7 @@ bool Sema::CheckTemplateArgumentList( CXXThisScopeRAII(*this, RD, ThisQuals, RD != nullptr); MultiLevelTemplateArgumentList MLTAL = getTemplateInstantiationArgs( - Template, NewContext, /*Final=*/true, SugaredConverted, + Template, NewContext, /*Final=*/false, CanonicalConverted, /*RelativeToPrimary=*/true, /*Pattern=*/nullptr, /*ForConceptInstantiation=*/true); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp 
b/clang/lib/Sema/SemaTemplateDeduction.cpp index 978f1a9dc1a933d..e9705ec43d86cc6 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3078,7 +3078,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, // If we don't need to replace the deduced template arguments, // we can add them immediately as the inner-most argument list. if (!DeducedArgsNeedReplacement(Template)) - Innermost = SugaredDeducedArgs; + Innermost = CanonicalDeducedArgs; MultiLevelTemplateArgumentList MLTAL = S.getTemplateInstantiationArgs( Template, Template->getDeclContext(), /*Final=*/false, Innermost, @@ -3090,7 +3090,7 @@ CheckDeducedArgumentConstraints(Sema &S, TemplateDeclT *Template, // not class-scope explicit specialization, so replace with Deduced Args // instead of adding to inner-most. if (!Innermost) - MLTAL.replaceInnermostTemplateArguments(Template, SugaredDeducedArgs); + MLTAL.replaceInnermostTemplateArguments(Template, CanonicalDeducedArgs); if (S.CheckConstraintSatisfaction(Template, AssociatedConstraints, MLTAL, Info.getLocation(), @@ -3913,13 +3913,13 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( (CanonicalBuilder.size() == FunctionTemplate->getTemplateParameters()->size())) { if (CheckInstantiatedFunctionTemplateConstraints( - Info.getLocation(), Specialization, SugaredBuilder, + Info.getLocation(), Specialization, CanonicalBuilder, Info.AssociatedConstraintsSatisfaction)) return TemplateDeductionResult::MiscellaneousDeductionFailure; if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { - Info.reset(TemplateArgumentList::CreateCopy(Context, SugaredBuilder), - Info.takeCanonical()); + Info.reset(Info.takeSugared(), + TemplateArgumentList::CreateCopy(Context, CanonicalBuilder)); return TemplateDeductionResult::ConstraintsNotSatisfied; } } @@ -4993,8 +4993,8 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, /*PartialTemplateArgs=*/false, SugaredConverted, 
CanonicalConverted)) return true; - MultiLevelTemplateArgumentList MLTAL(Concept, SugaredConverted, - /*Final=*/true); + MultiLevelTemplateArgumentList MLTAL(Concept, CanonicalConverted, + /*Final=*/false); // Build up an EvaluationContext with an ImplicitConceptSpecializationDecl so // that the template arguments of the constraint can be preserved. For // example: @@ -5008,7 +5008,7 @@ static bool CheckDeducedPlaceholderConstraints(Sema &S, const AutoType &Type, S, Sema::ExpressionEvaluationContext::Unevaluated, ImplicitConceptSpecializationDecl::Create( S.getASTContext(), Concept->getDeclContext(), Concept->getLocation(), - SugaredConverted)); + CanonicalConverted)); if (S.CheckConstraintSatisfaction(Concept, {Concept->getConstraintExpr()}, MLTAL, TypeLoc.getLocalSourceRange(), Satisfaction)) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index fcf1579d86dda7f..c118f3818467d93 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2391,7 +2391,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl( VisitDecl(D); llvm::SmallVector Args; for (unsigned I = 0; I < D->NumTemplateArgs; ++I) - Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/false)); + Args.push_back(Record.readTemplateArgument(/*Canonicalize=*/true)); D->setTemplateArguments(Args); } diff --git a/clang/test/AST/ast-dump-concepts.cpp b/clang/test/AST/ast-dump-concepts.cpp index 4b8e9026b2916b0..a5e0673c241ef41 100644 --- a/clang/test/AST/ast-dump-concepts.cpp +++ b/clang/test/AST/ast-dump-concepts.cpp @@ -20,9 +20,8 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'binary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' Concept {{.*}} 'binary_concept' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9 - // CHECK-NEXT: | |-TemplateArgument type 'R' - // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'R' dependent 
{{.*}}depth 1 index 0 - // CHECK-NEXT: | | `-TemplateTypeParm {{.*}} 'R' + // CHECK-NEXT: | |-TemplateArgument type 'type-parameter-1-0' + // CHECK-NEXT: | | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 // CHECK-NEXT: | `-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType {{.*}} 'int' // CHECK-NEXT: |-TemplateArgument {{.*}} type 'R' @@ -36,9 +35,8 @@ struct Foo { // CHECK: TemplateTypeParmDecl {{.*}} referenced Concept {{.*}} 'unary_concept' // CHECK-NEXT: `-ConceptSpecializationExpr {{.*}} 'bool' // CHECK-NEXT: |-ImplicitConceptSpecializationDecl {{.*}} col:9 - // CHECK-NEXT: | `-TemplateArgument type 'R' - // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'R' dependent {{.*}}depth 1 index 0 - // CHECK-NEXT: | `-TemplateTypeParm {{.*}} 'R' + // CHECK-NEXT: | `-TemplateArgument type 'type-parameter-1-0' + // CHECK-NEXT: | `-TemplateTypeParmType {{.*}} 'type-parameter-1-0' dependent {{.*}}depth 1 index 0 template Foo(R); diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp index 3d6f0b11fa99c90..dc0e84280e05672 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/compound-requirement.cpp @@ -35,14 +35,14 @@ using r2i2 = r2; // expected-error{{constraints not satisfied for class templ using r2i3 = r2; using r2i4 = r2; // expected-error{{constraints not satisfied for class template 'r2' [with T = const D]}} -template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} +template requires requires { { sizeof(T) }; } // expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete 
type 'void'}} expected-note{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} // Non-dependent expressions @@ -149,7 +149,7 @@ namespace std_example { template constexpr bool is_same_v = true; template concept same_as = is_same_v; - // expected-note@-1 {{because 'is_same_v' evaluated to false}} + // expected-note@-1 {{because 'is_same_v' evaluated to false}} static_assert(C1); static_assert(C1); @@ -173,9 +173,9 @@ namespace std_example { int operator *() { return 0; } }; static_assert(C2); - template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'T2' does not satisfy 'C2'}} + template struct C2_check {}; // expected-note{{because 'int' does not satisfy 'C2'}} expected-note{{because 'std_example::T2' does not satisfy 'C2'}} using c2c1 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = int]}} - using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = T2]}} + using c2c2 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::T2]}} template void g(T t) noexcept(sizeof(T) == 1) {} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp index 00ac9d0422d67e5..763d983d20f6152 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp @@ 
-27,7 +27,7 @@ using r4i = X::r4; // expected-error{{constraints not satisfied for c // C++ [expr.prim.req.nested] Examples namespace std_example { - template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(decltype(+t)) == 1' (4 == 1) evaluated to false}} + template concept C1 = sizeof(U) == 1; // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} template concept D = requires (T t) { requires C1; // expected-note{{because 'decltype(+t)' (aka 'int') does not satisfy 'C1'}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp index abfadfa34884119..7515f5c62d5ea88 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/simple-requirement.cpp @@ -39,14 +39,14 @@ using r2i4 = r2; // expected-error{{constraints not satisfied for class template requires requires { sizeof(T); } // expected-note@-1{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'void'}} -// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'class nonexistent'}} +// expected-note@-2{{because 'sizeof(T)' would be invalid: invalid application of 'sizeof' to an incomplete type 'nonexistent'}} struct r3 {}; using r3i1 = r3; using r3i2 = r3; using r3i3 = r3; using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = void]}} -using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = class nonexistent]}} +using r3i4 = r3; // expected-error{{constraints not satisfied for class template 'r3' [with T = nonexistent]}} template requires requires (T t) { 0; "a"; (void)'a'; } struct r4 {}; diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp index 
28dff336d053c6f..5433cfb21955dd8 100644 --- a/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp +++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/type-requirement.cpp @@ -182,14 +182,14 @@ namespace std_example { static_assert(C1 && C2 && C3); template struct C1_check {}; // expected-note@-1 {{because 'int' does not satisfy 'C1'}} - // expected-note@-2 {{because 'has_type' does not satisfy 'C1'}} + // expected-note@-2 {{because 'std_example::has_type' does not satisfy 'C1'}} template struct C2_check {}; - // expected-note@-1 {{because 'has_inner' does not satisfy 'C2'}} + // expected-note@-1 {{because 'std_example::has_inner' does not satisfy 'C2'}} template struct C3_check {}; // expected-note@-1 {{because 'void' does not satisfy 'C3'}} using c1 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = int]}} - using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = has_type]}} - using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = has_inner]}} + using c2 = C1_check; // expected-error{{constraints not satisfied for class template 'C1_check' [with T = std_example::has_type]}} + using c3 = C2_check; // expected-error{{constraints not satisfied for class template 'C2_check' [with T = std_example::has_inner]}} using c4 = C3_check; // expected-error{{constraints not satisfied for class template 'C3_check' [with T = void]}} } @@ -199,10 +199,10 @@ template concept C = requires { requires requires { T::a; }; }; // expected-note@-1 {{because 'T::a' would be invalid: no member named 'a' in 'PR48656::T1'}} template struct A {}; -// expected-note@-1 {{because 'T1' does not satisfy 'C'}} +// expected-note@-1 {{because 'PR48656::T1' does not satisfy 'C'}} struct T1 {}; -template struct A; // expected-error {{constraints not satisfied for class template 'A' [with $0 = ]}} +template struct A; // expected-error {{constraints 
not satisfied for class template 'A' [with $0 = ]}} struct T2 { static constexpr bool a = false; }; template struct A; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp index 02c97b4591a1566..d80710937cdfa1d 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.normal/p1.cpp @@ -8,7 +8,7 @@ template requires Bar && true struct S { }; template concept True2 = sizeof(T) >= 0; template concept Foo2 = True2; -// expected-error@-1{{'type name' declared as a pointer to a reference of type 'T &'}} +// expected-error@-1{{'type name' declared as a pointer to a reference of type 'type-parameter-0-0 &'}} template concept Bar2 = Foo2; // expected-note@-1{{while substituting into concept arguments here; substitution failures not allowed in concept arguments}} template requires Bar2 struct S2 { }; diff --git a/clang/test/CXX/temp/temp.param/p10-2a.cpp b/clang/test/CXX/temp/temp.param/p10-2a.cpp index 97e0ef35837b16d..4f5fdd3b4809ac5 100644 --- a/clang/test/CXX/temp/temp.param/p10-2a.cpp +++ b/clang/test/CXX/temp/temp.param/p10-2a.cpp @@ -94,8 +94,8 @@ concept OneOf = (is_same_v || ...); // expected-note@-5 {{and 'is_same_v' evaluated to false}} // expected-note@-6 3{{because 'is_same_v' evaluated to false}} // expected-note@-7 3{{and 'is_same_v' evaluated to false}} -// expected-note@-8 2{{because 'is_same_v' evaluated to false}} -// expected-note@-9 2{{and 'is_same_v' evaluated to false}} +// expected-note@-8 2{{because 'is_same_v' evaluated to false}} +// expected-note@-9 2{{and 'is_same_v' evaluated to false}} template T, OneOf U> // expected-note@-1 2{{because 'OneOf' evaluated to false}} diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 8ac5a0e753a3409..9330df8cdd0398a 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ 
b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -12,7 +12,7 @@ void g() { // expected-note@#FDEF{{because 'int' does not satisfy 'c'}} // expected-note@#CDEF{{because 'f(t)' would be invalid: no matching function for call to 'f'}} } -} // namespace GH53213 +} // namespace GH53213 namespace GH45736 { struct constrained; @@ -69,13 +69,13 @@ void baz() { auto it = begin(rng); // #BEGIN_CALL // expected-error@#INF_BEGIN {{satisfaction of constraint 'Inf' depends on itself}} // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} -// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin' required here}} +// expected-note@#INF_BEGIN_EXPR {{while checking constraint satisfaction for template 'begin' required here}} // expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}} // expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}} // expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}} -// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf' requested here}} +// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf' requested here}} // expected-note@#INF_BEGIN {{while substituting template arguments into constraint expression here}} -// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin' required here}} +// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin' required here}} // expected-note@#BEGIN_CALL {{in instantiation of function template specialization}} // Fallout of the failure is failed lookup, which is necessary to stop odd @@ -100,7 +100,7 @@ namespace GH50891 { static_assert(Numeric); // #STATIC_ASSERT // expected-error@#NUMERIC{{satisfaction of constraint 'requires (T a) { foo(a); }' depends on itself}} // expected-note@#NUMERIC {{while 
substituting template arguments into constraint expression here}} - // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric' requested here}} + // expected-note@#OP_TO {{while checking the satisfaction of concept 'Numeric' requested here}} // expected-note@#OP_TO {{while substituting template arguments into constraint expression here}} // expected-note@#FOO_CALL {{while checking constraint satisfaction for template}} // expected-note@#FOO_CALL {{in instantiation of function template specialization}} @@ -108,7 +108,7 @@ namespace GH50891 { // expected-note@#NUMERIC {{while substituting template arguments into constraint expression here}} // expected-error@#STATIC_ASSERT {{static assertion failed}} - // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric' requested here}} + // expected-note@#STATIC_ASSERT{{while checking the satisfaction of concept 'Numeric' requested here}} // expected-note@#STATIC_ASSERT{{because substituted constraint expression is ill-formed: constraint depends on a previously diagnosed expression}} } // namespace GH50891 diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 19b138270eebb28..a4b42cad79abd49 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -836,13 +836,13 @@ struct Parent { static_assert(Parent::TakesUnary::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesUnary'}} // expected-note@#UNARY{{because 'decltype(0ULL)' (aka 'unsigned long long') does not satisfy 'C'}} -// expected-note@#61777_C{{because 'sizeof(decltype(0ULL)) == 4' (8 == 4) evaluated to false}} +// expected-note@#61777_C{{because 'sizeof(unsigned long long) == 4' (8 == 4) evaluated to false}} static_assert(Parent::TakesUnary::i == 0); static_assert(Parent::TakesBinary::i == 0); // expected-error@+3{{constraints not satisfied for class template 'TakesBinary'}} // expected-note@#BINARY{{because 'C2' 
evaluated to false}} -// expected-note@#61777_C2{{because 'sizeof(decltype(0ULL)) == sizeof(int)' (8 == 4) evaluated to false}} +// expected-note@#61777_C2{{because 'sizeof(unsigned long long) == sizeof(int)' (8 == 4) evaluated to false}} static_assert(Parent::TakesBinary::i == 0); } diff --git a/clang/test/SemaTemplate/instantiate-requires-expr.cpp b/clang/test/SemaTemplate/instantiate-requires-expr.cpp index c5ebeff828295a8..20a19d731ae1696 100644 --- a/clang/test/SemaTemplate/instantiate-requires-expr.cpp +++ b/clang/test/SemaTemplate/instantiate-requires-expr.cpp @@ -76,8 +76,8 @@ namespace type_requirement { // expected-note@-2 {{because 'false_v::template temp >; }>' evaluated to false}} struct r2 {}; - using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template]}} - using r2i2 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = contains_template]}} + using r2i1 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} + using r2i2 = r2>; // expected-error{{constraints not satisfied for class template 'r2' [with T = type_requirement::contains_template]}} // substitution error occurs, then requires expr is instantiated again @@ -108,7 +108,7 @@ namespace type_requirement { // expected-note@-1 {{because 'false_v>; } && requires { <>; }>' evaluated to false}} struct r7 {}; - using r7i = r7; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = ]}} + using r7i = r7; // expected-error{{constraints not satisfied for class template 'r7' [with Ts = ]}} } namespace expr_requirement { @@ -237,13 +237,3 @@ constexpr bool e_v = true; static_assert(e_v); } // namespace GH73885 - -namespace sugared_instantiation { - template concept C = requires { C1{}; }; - template concept D = requires { new D1; }; - - // Test that 'deduced auto' doesn't get confused with 'undeduced auto'. 
- auto f() { return 0; } - static_assert(requires { { f() } -> C; }); - static_assert(requires { { f() } -> D; }); -} // namespace sugared_instantiation diff --git a/clang/test/SemaTemplate/pr52970.cpp b/clang/test/SemaTemplate/pr52970.cpp index 6aabc419bd2b88c..7aac5ee85659349 100644 --- a/clang/test/SemaTemplate/pr52970.cpp +++ b/clang/test/SemaTemplate/pr52970.cpp @@ -53,7 +53,7 @@ static_assert(!DotFollowingPointer::f(Bad{}), ""); #if __cplusplus >= 202002L template concept C = requires(T t) { t.begin(); }; - // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Bad' (aka 'Holder *') is a pointer}} + // cxx20-note@-1 {{because 't.begin()' would be invalid: member reference type 'Holder *' is a pointer}} static_assert(C); static_assert(!C); From 689700439ab07619a822cf1d902b90ee2b6037fe Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 8 Aug 2024 20:50:27 -0700 Subject: [PATCH 099/112] Fix test on Windows --- .../Windows/dll_report_globals_symbolization_at_startup.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp index e3efa0a1a4fd278..06a632e6708b1e6 100644 --- a/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp +++ b/compiler-rt/test/asan/TestCases/Windows/dll_report_globals_symbolization_at_startup.cpp @@ -10,9 +10,8 @@ #include #include -extern "C" { #if defined(EXE) -__declspec(dllimport) int foo_from_dll(); +extern "C" __declspec(dllimport) int foo_from_dll(); // CHECK: in DLL(reason=1) int main(int argc, char **argv) { @@ -23,6 +22,7 @@ int main(int argc, char **argv) { // CHECK: in DLL(reason=0) } #elif defined(DLL) +extern "C" { // This global is registered at startup. 
int x[42]; @@ -35,7 +35,7 @@ BOOL WINAPI DllMain(HMODULE, DWORD reason, LPVOID) { fflush(0); return TRUE; } +} #else # error oops! #endif -} From 5297b750e54dafe16cc13f24b8d5478214e83682 Mon Sep 17 00:00:00 2001 From: akshaykumars614 <88362922+akshaykumars614@users.noreply.github.com> Date: Fri, 9 Aug 2024 00:55:15 -0400 Subject: [PATCH 100/112] clang-tidy: readability-redundant-smartptr-get does not remove (#97964) (#100177) added a check to remove '->' if exists added testcase and modified Release Notes --- .../readability/RedundantSmartptrGetCheck.cpp | 4 ++ clang-tools-extra/docs/ReleaseNotes.rst | 4 ++ .../readability/redundant-smartptr-get.cpp | 48 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp index 8837ac16e882817..be52af77ae0a519 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantSmartptrGetCheck.cpp @@ -164,6 +164,10 @@ void RedundantSmartptrGetCheck::check(const MatchFinder::MatchResult &Result) { StringRef SmartptrText = Lexer::getSourceText( CharSourceRange::getTokenRange(Smartptr->getSourceRange()), *Result.SourceManager, getLangOpts()); + // Check if the last two characters are "->" and remove them + if (SmartptrText.ends_with("->")) { + SmartptrText = SmartptrText.drop_back(2); + } // Replace foo->get() with *foo, and foo.get() with foo. std::string Replacement = Twine(IsPtrToPtr ? 
"*" : "", SmartptrText).str(); diag(GetCall->getBeginLoc(), "redundant get() call on smart pointer") diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 642ad39cc0c1c5d..b72d109b3d3938b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -104,6 +104,10 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`readability-redundant-smartptr-get + ` check to + remove `->`, when reduntant `get()` is removed. + Removed checks ^^^^^^^^^^^^^^ diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp index 01f12b6bfe6ea0b..ec4ca4cb79484b2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp @@ -20,6 +20,23 @@ struct shared_ptr { explicit operator bool() const noexcept; }; +template +struct vector { + vector(); + bool operator==(const vector& other) const; + bool operator!=(const vector& other) const; + unsigned long size() const; + bool empty() const; + + using iterator = T*; + + iterator begin(); + iterator end(); + + T* data; + unsigned long sz; +}; + } // namespace std struct Bar { @@ -235,3 +252,34 @@ void Negative() { if (MACRO(x) == nullptr) ; } + +void test_redundant_get() { + std::vector> v; + auto f = [](int) {}; + for (auto i = v.begin(); i != v.end(); ++i) { + f(*i->get()); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: redundant get() call + // CHECK-FIXES: f(**i); + } +} + +struct Inner { + int a; + int *getValue() { return &a; } +}; + +struct Example { + Inner inner; + Inner* get() { return &inner; } + int *getValue() { return inner.getValue(); } +}; + +void test_redundant_get_with_member() { + std::vector> v; + auto f = [](int) {}; + for (auto i = v.begin(); i != v.end(); ++i) { + 
f(*(*i).get()->get()->getValue()); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: redundant get() call + // CHECK-FIXES: f(**i->get()->getValue()); + } +} From be66c506c7fd6fdb7363f724075d02ca0d35713a Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 9 Aug 2024 13:39:24 +0800 Subject: [PATCH 101/112] [C++20] [Modules] Emit Errors when compiling a non-module source as module (#102565) Close https://github.com/llvm/llvm-project/issues/101398 The root cause of the issue is that I removed the codes before and failed to recognize it in time and this was not found for a long time due to it only crashes with invalid codes. --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 ++ clang/lib/Sema/Sema.cpp | 12 ++++++++++++ .../dcl.dcl/dcl.module/dcl.module.interface/p1.cppm | 4 +++- clang/test/Modules/pr101398.cppm | 5 +++++ 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 clang/test/Modules/pr101398.cppm diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5cdf36660b2a66d..554dbaff2ce0d87 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11693,6 +11693,8 @@ def err_module_not_defined : Error< def err_module_redeclaration : Error< "translation unit contains multiple module declarations">; def note_prev_module_declaration : Note<"previous module declaration is here">; +def err_module_declaration_missing : Error< + "missing 'export module' declaration in module interface unit">; def err_module_declaration_missing_after_global_module_introducer : Error< "missing 'module' declaration at end of global module fragment " "introduced here">; diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 19d8692ee64849b..633b8220ffbf11d 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1272,6 +1272,18 @@ void Sema::ActOnEndOfTranslationUnit() { Module::ExplicitGlobalModuleFragment) { 
Diag(ModuleScopes.back().BeginLoc, diag::err_module_declaration_missing_after_global_module_introducer); + } else if (getLangOpts().getCompilingModule() == + LangOptions::CMK_ModuleInterface && + // We can't use ModuleScopes here since ModuleScopes is always + // empty if we're compiling the BMI. + !getASTContext().getCurrentNamedModule()) { + // If we are building a module interface unit, we should have seen the + // module declaration. + // + // FIXME: Make a better guess as to where to put the module declaration. + Diag(getSourceManager().getLocForStartOfFile( + getSourceManager().getMainFileID()), + diag::err_module_declaration_missing); } // Now we can decide whether the modules we're building need an initializer. diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm index 1a01ffac0154aef..84ef85126c369a0 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++20 %s -verify -o /dev/null +// RUN: %clang_cc1 -std=c++20 %s -verify -emit-module-interface -o /dev/null // RUN: %clang_cc1 -std=c++20 %s -DINTERFACE -verify -emit-module-interface -o %t // RUN: %clang_cc1 -std=c++20 %s -DIMPLEMENTATION -verify -fmodule-file=A=%t -o /dev/null // @@ -15,6 +15,8 @@ module A; // #module-decl // expected-error@-2 {{missing 'export' specifier in module declaration while building module interface}} #define INTERFACE #endif +#else // Not in a module +// expected-error@* {{missing 'export module' declaration in module interface unit}} #endif #ifndef INTERFACE diff --git a/clang/test/Modules/pr101398.cppm b/clang/test/Modules/pr101398.cppm new file mode 100644 index 000000000000000..843d0ce84fdce3a --- /dev/null +++ b/clang/test/Modules/pr101398.cppm @@ -0,0 +1,5 @@ +// RUN: mkdir -p %t +// RUN: %clang -std=c++20 -xc++-module %s -Xclang 
-verify --precompile -o %t/tmp.pcm +// not modules + +// expected-error@* {{missing 'export module' declaration in module interface unit}} From f09a28e66cfc5815dbac3e42944b5d0432f970bc Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 9 Aug 2024 07:51:30 +0200 Subject: [PATCH 102/112] [mlir][Transforms][NFC] Dialect conversion: Eagerly build reverse mapping (#101476) The "inverse mapping" is an inverse IRMapping that points from replaced values to their original values. This inverse mapping is needed when legalizing unresolved materializations, to figure out if a value has any uses. (It is not sufficient to examine the IR, because some IR changes have not been materialized yet.) There was a check in `OperationConverter::finalize` that computed the inverse mapping only when needed. This check is not needed. `legalizeUnresolvedMaterializations` always computes the inverse mapping, so we can just do that in `OperationConverter::finalize` before calling `legalizeUnresolvedMaterializations`. Depends on #98805 --- .../Transforms/Utils/DialectConversion.cpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index e86da57fb915788..8f9b21b7ee1e5b9 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -2352,7 +2352,7 @@ struct OperationConverter { LogicalResult legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, - std::optional>> &inverseMapping); + DenseMap> &inverseMapping); /// Legalize an operation result that was marked as "erased". 
LogicalResult @@ -2454,10 +2454,12 @@ LogicalResult OperationConverter::convertOperations(ArrayRef ops) { LogicalResult OperationConverter::finalize(ConversionPatternRewriter &rewriter) { - std::optional>> inverseMapping; ConversionPatternRewriterImpl &rewriterImpl = rewriter.getImpl(); - if (failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl)) || - failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, + if (failed(legalizeConvertedArgumentTypes(rewriter, rewriterImpl))) + return failure(); + DenseMap> inverseMapping = + rewriterImpl.mapping.getInverse(); + if (failed(legalizeUnresolvedMaterializations(rewriter, rewriterImpl, inverseMapping))) return failure(); @@ -2483,15 +2485,11 @@ OperationConverter::finalize(ConversionPatternRewriter &rewriter) { if (result.getType() == newValue.getType()) continue; - // Compute the inverse mapping only if it is really needed. - if (!inverseMapping) - inverseMapping = rewriterImpl.mapping.getInverse(); - // Legalize this result. rewriter.setInsertionPoint(op); if (failed(legalizeChangedResultType( op, result, newValue, opReplacement->getConverter(), rewriter, - rewriterImpl, *inverseMapping))) + rewriterImpl, inverseMapping))) return failure(); } } @@ -2503,6 +2501,8 @@ LogicalResult OperationConverter::legalizeConvertedArgumentTypes( ConversionPatternRewriterImpl &rewriterImpl) { // Functor used to check if all users of a value will be dead after // conversion. + // TODO: This should probably query the inverse mapping, same as in + // `legalizeChangedResultType`. 
auto findLiveUser = [&](Value val) { auto liveUserIt = llvm::find_if_not(val.getUsers(), [&](Operation *user) { return rewriterImpl.isOpIgnored(user); @@ -2796,20 +2796,18 @@ static LogicalResult legalizeUnresolvedMaterialization( LogicalResult OperationConverter::legalizeUnresolvedMaterializations( ConversionPatternRewriter &rewriter, ConversionPatternRewriterImpl &rewriterImpl, - std::optional>> &inverseMapping) { - inverseMapping = rewriterImpl.mapping.getInverse(); - + DenseMap> &inverseMapping) { // As an initial step, compute all of the inserted materializations that we // expect to persist beyond the conversion process. DenseMap materializationOps; SetVector necessaryMaterializations; computeNecessaryMaterializations(materializationOps, rewriter, rewriterImpl, - *inverseMapping, necessaryMaterializations); + inverseMapping, necessaryMaterializations); // Once computed, legalize any necessary materializations. for (auto *mat : necessaryMaterializations) { if (failed(legalizeUnresolvedMaterialization( - *mat, materializationOps, rewriter, rewriterImpl, *inverseMapping))) + *mat, materializationOps, rewriter, rewriterImpl, inverseMapping))) return failure(); } return success(); From e1a16cd88d761569025068ca94195acd77b2df7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roger=20Ferrer=20Ib=C3=A1=C3=B1ez?= Date: Fri, 9 Aug 2024 08:01:42 +0200 Subject: [PATCH 103/112] [ExpandVectorPredication] Be more precise reporting changes (#102313) This is used by PreISelIntrinsicLowering. With this change, PreISelIntrinsicLowering does not have to assume that there were changes just because we encountered a VP intrinsic. 
--- .../llvm/CodeGen/ExpandVectorPredication.h | 19 +++++-- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 51 ++++++++++++------- llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 10 ++-- 3 files changed, 54 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ExpandVectorPredication.h b/llvm/include/llvm/CodeGen/ExpandVectorPredication.h index c42c644c99e91ec..3aafb22eab6beaf 100644 --- a/llvm/include/llvm/CodeGen/ExpandVectorPredication.h +++ b/llvm/include/llvm/CodeGen/ExpandVectorPredication.h @@ -16,10 +16,21 @@ namespace llvm { class TargetTransformInfo; class VPIntrinsic; -/// Expand a vector predication intrinsic. Returns true if the intrinsic was -/// removed/replaced. -bool expandVectorPredicationIntrinsic(VPIntrinsic &VPI, - const TargetTransformInfo &TTI); +/// Represents the details the expansion of a VP intrinsic. +enum class VPExpansionDetails { + /// No change happened during expansion. + IntrinsicUnchanged, + /// At least one operand was updated. + IntrinsicUpdated, + /// The whole intrinsic was replaced. + IntrinsicReplaced, +}; + +/// Expand a vector predication intrinsic. Returns the kind of expansion +/// that was applied to the intrinsic. +VPExpansionDetails +expandVectorPredicationIntrinsic(VPIntrinsic &VPI, + const TargetTransformInfo &TTI); } // end namespace llvm diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 5ffdbcda93e7530..675d88d6d38cd9e 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -160,11 +160,15 @@ struct CachingVPExpander { Value *convertEVLToMask(IRBuilder<> &Builder, Value *EVLParam, ElementCount ElemCount); - Value *foldEVLIntoMask(VPIntrinsic &VPI); + /// If needed, folds the EVL in the mask operand and discards the EVL + /// parameter. Returns a pair of the value of the intrinsic after the change + /// (if any) and whether the mask was actually folded. 
+ std::pair foldEVLIntoMask(VPIntrinsic &VPI); /// "Remove" the %evl parameter of \p PI by setting it to the static vector - /// length of the operation. - void discardEVLParameter(VPIntrinsic &PI); + /// length of the operation. Returns true if the %evl (if any) was effectively + /// changed. + bool discardEVLParameter(VPIntrinsic &PI); /// Lower this VP binary operator to a unpredicated binary operator. Value *expandPredicationInBinaryOperator(IRBuilder<> &Builder, @@ -206,7 +210,9 @@ struct CachingVPExpander { CachingVPExpander(const TargetTransformInfo &TTI) : TTI(TTI), UsingTTIOverrides(anyExpandVPOverridesSet()) {} - bool expandVectorPredication(VPIntrinsic &VPI); + /// Expand llvm.vp.* intrinsics as requested by \p TTI. + /// Returns the details of the expansion. + VPExpansionDetails expandVectorPredication(VPIntrinsic &VPI); }; //// CachingVPExpander { @@ -645,15 +651,15 @@ Value *CachingVPExpander::expandPredicationInComparison(IRBuilder<> &Builder, return NewCmp; } -void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { +bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n"); if (VPI.canIgnoreVectorLengthParam()) - return; + return false; Value *EVLParam = VPI.getVectorLengthParam(); if (!EVLParam) - return; + return false; ElementCount StaticElemCount = VPI.getStaticVectorLength(); Value *MaxEVL = nullptr; @@ -672,16 +678,17 @@ void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { MaxEVL = ConstantInt::get(Int32Ty, StaticElemCount.getFixedValue(), false); } VPI.setVectorLengthParam(MaxEVL); + return true; } -Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { +std::pair CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { LLVM_DEBUG(dbgs() << "Folding vlen for " << VPI << '\n'); IRBuilder<> Builder(&VPI); // Ineffective %evl parameter and so nothing to do here. 
if (VPI.canIgnoreVectorLengthParam()) - return &VPI; + return {&VPI, false}; // Only VP intrinsics can have an %evl parameter. Value *OldMaskParam = VPI.getMaskParam(); @@ -704,7 +711,7 @@ Value *CachingVPExpander::foldEVLIntoMask(VPIntrinsic &VPI) { "transformation did not render the evl param ineffective!"); // Reassess the modified instruction. - return &VPI; + return {&VPI, true}; } Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { @@ -807,21 +814,27 @@ CachingVPExpander::getVPLegalizationStrategy(const VPIntrinsic &VPI) const { return VPStrat; } -/// Expand llvm.vp.* intrinsics as requested by \p TTI. -bool CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { +VPExpansionDetails +CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { auto Strategy = getVPLegalizationStrategy(VPI); sanitizeStrategy(VPI, Strategy); + VPExpansionDetails Changed = VPExpansionDetails::IntrinsicUnchanged; + // Transform the EVL parameter. switch (Strategy.EVLParamStrategy) { case VPLegalization::Legal: break; case VPLegalization::Discard: - discardEVLParameter(VPI); + if (discardEVLParameter(VPI)) + Changed = VPExpansionDetails::IntrinsicUpdated; break; case VPLegalization::Convert: - if (foldEVLIntoMask(VPI)) + if (auto [NewVPI, Folded] = foldEVLIntoMask(VPI); Folded) { + (void)NewVPI; + Changed = VPExpansionDetails::IntrinsicUpdated; ++NumFoldedVL; + } break; } @@ -834,17 +847,17 @@ bool CachingVPExpander::expandVectorPredication(VPIntrinsic &VPI) { case VPLegalization::Convert: if (Value *V = expandPredication(VPI); V != &VPI) { ++NumLoweredVPOps; - // Return true if and only if the intrinsic was actually removed. 
- return true; + Changed = VPExpansionDetails::IntrinsicReplaced; } break; } - return false; + return Changed; } } // namespace -bool llvm::expandVectorPredicationIntrinsic(VPIntrinsic &VPI, - const TargetTransformInfo &TTI) { +VPExpansionDetails +llvm::expandVectorPredicationIntrinsic(VPIntrinsic &VPI, + const TargetTransformInfo &TTI) { return CachingVPExpander(TTI).expandVectorPredication(VPI); } diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 0df90f402aaf481..3373b76edb268f8 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -361,10 +361,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { Function *Parent = CI->getParent()->getParent(); const TargetTransformInfo &TTI = LookupTTI(*Parent); auto *VPI = cast(CI); - return expandVectorPredicationIntrinsic(*VPI, TTI); + VPExpansionDetails ED = expandVectorPredicationIntrinsic(*VPI, TTI); + // Expansion of VP intrinsics may change the IR but not actually + // replace the intrinsic, so update Changed for the pass + // and compute Removed for forEachCall. + Changed |= ED != VPExpansionDetails::IntrinsicUnchanged; + bool Removed = ED == VPExpansionDetails::IntrinsicReplaced; + return Removed; }); - // Not all intrinsics are removed, but the code is changed in any case. - Changed = true; break; case Intrinsic::objc_autorelease: Changed |= lowerObjCCall(F, "objc_autorelease"); From dbae30df242bdd5f20ca9a9592c1d9f201c7cb6c Mon Sep 17 00:00:00 2001 From: hev Date: Fri, 9 Aug 2024 14:08:32 +0800 Subject: [PATCH 104/112] [LoongArch] Load floating-point immediate using VLDI (#101923) This commit uses the VLDI instruction to load some common floating-point constants when the LSX feature is enabled. 
--- .../LoongArch/LoongArchISelLowering.cpp | 20 +- .../Target/LoongArch/LoongArchISelLowering.h | 2 + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 31 + llvm/test/CodeGen/LoongArch/float-imm-vldi.ll | 5126 +++++------------ 4 files changed, 1334 insertions(+), 3845 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bdbb7ab0f513957..44d684331b3eb46 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5573,6 +5573,24 @@ SDValue LoongArchTargetLowering::LowerReturn( return DAG.getNode(LoongArchISD::RET, DL, MVT::Other, RetOps); } +bool LoongArchTargetLowering::isFPImmVLDILegal(const APFloat &Imm, + EVT VT) const { + if (!Subtarget.hasExtLSX()) + return false; + + if (VT == MVT::f32) { + uint64_t masked = Imm.bitcastToAPInt().getZExtValue() & 0x7e07ffff; + return (masked == 0x3e000000 || masked == 0x40000000); + } + + if (VT == MVT::f64) { + uint64_t masked = Imm.bitcastToAPInt().getZExtValue() & 0x7fc0ffffffffffff; + return (masked == 0x3fc0000000000000 || masked == 0x4000000000000000); + } + + return false; +} + bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const { // TODO: Maybe need more checks here after vector extension is supported. 
@@ -5580,7 +5598,7 @@ bool LoongArchTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, return false; if (VT == MVT::f64 && !Subtarget.hasBasicD()) return false; - return (Imm.isZero() || Imm.isExactlyValue(+1.0)); + return (Imm.isZero() || Imm.isExactlyValue(1.0) || isFPImmVLDILegal(Imm, VT)); } bool LoongArchTargetLowering::isCheapToSpeculateCttz(Type *) const { diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index d834a5d8587fd95..9723789e919b153 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -260,6 +260,8 @@ class LoongArchTargetLowering : public TargetLowering { bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override; + bool isFPImmVLDILegal(const APFloat &Imm, EVT VT) const; + private: /// Target-specific function used to lower LoongArch calling conventions. typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 0580683c3ce3033..cac47e35afe258c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -203,6 +203,29 @@ def to_valid_timm : SDNodeXFormgetTargetConstant(CN->getSExtValue(), SDLoc(N), Subtarget->getGRLenVT()); }]>; +// FP immediate of VLDI patterns. 
+def f32imm_vldi : PatLeaf<(fpimm), [{ + const auto &TLI = + *static_cast(getTargetLowering()); + return TLI.isFPImmVLDILegal(N->getValueAPF(), MVT::f32); +}]>; +def f64imm_vldi : PatLeaf<(fpimm), [{ + const auto &TLI = + *static_cast(getTargetLowering()); + return TLI.isFPImmVLDILegal(N->getValueAPF(), MVT::f64); +}]>; + +def to_f32imm_vldi : SDNodeXFormgetValueAPF().bitcastToAPInt().getZExtValue(); + x = (0b11011 << 8) | (((x >> 24) & 0xc0) ^ 0x40) | ((x >> 19) & 0x3f); + return CurDAG->getTargetConstant(SignExtend32<13>(x), SDLoc(N), MVT::i32); +}]>; +def to_f64imm_vldi : SDNodeXFormgetValueAPF().bitcastToAPInt().getZExtValue(); + x = (0b11100 << 8) | (((x >> 56) & 0xc0) ^ 0x40) | ((x >> 48) & 0x3f); + return CurDAG->getTargetConstant(SignExtend32<13>(x), SDLoc(N), MVT::i32); +}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -663,7 +686,9 @@ def VMSKGEZ_B : LSX2R_VV<0x729c5000>; def VMSKNZ_B : LSX2R_VV<0x729c6000>; +let isReMaterializable = 1, isAsCheapAsAMove = 1 in { def VLDI : LSX1RI13_VI<0x73e00000>; +} def VAND_V : LSX3R_VVV<0x71260000>; def VOR_V : LSX3R_VVV<0x71268000>; @@ -1910,6 +1935,12 @@ def : Pat<(v2i64 (fp_to_sint v2f64:$vj)), (VFTINTRZ_L_D v2f64:$vj)>; def : Pat<(v4i32 (fp_to_uint v4f32:$vj)), (VFTINTRZ_WU_S v4f32:$vj)>; def : Pat<(v2i64 (fp_to_uint v2f64:$vj)), (VFTINTRZ_LU_D v2f64:$vj)>; +// Vector loads floating-point constants +def : Pat<(f32 f32imm_vldi:$in), + (f32 (EXTRACT_SUBREG (VLDI (to_f32imm_vldi f32imm_vldi:$in)), sub_32))>; +def : Pat<(f64 f64imm_vldi:$in), + (f64 (EXTRACT_SUBREG (VLDI (to_f64imm_vldi f64imm_vldi:$in)), sub_64))>; + } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll index f228efb6b658824..551ab6ea44c66d0 100644 --- 
a/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll +++ b/llvm/test/CodeGen/LoongArch/float-imm-vldi.ll @@ -1,5125 +1,2563 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA32 -; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s --check-prefix=LA64 +; RUN: llc --mtriple=loongarch32 --mattr=+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s define dso_local { float, double } @test1() { -; LA32-LABEL: test1: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI0_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI0_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test1: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI0_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI0_1) -; LA64-NEXT: ret +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1280 +; CHECK-NEXT: vldi $vr1, -1024 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.0000000000, double 2.0000000000 } } define dso_local { float, double } @test2() { -; LA32-LABEL: test2: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI1_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test2: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI1_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI1_1) -; LA64-NEXT: ret +; CHECK-LABEL: test2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1279 +; CHECK-NEXT: vldi $vr1, -1023 +; CHECK-NEXT: 
ret entry: ret { float, double } { float 2.1250000000, double 2.1250000000 } } define dso_local { float, double } @test3() { -; LA32-LABEL: test3: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI2_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test3: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI2_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI2_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI2_1) -; LA64-NEXT: ret +; CHECK-LABEL: test3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1278 +; CHECK-NEXT: vldi $vr1, -1022 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.2500000000, double 2.2500000000 } } define dso_local { float, double } @test4() { -; LA32-LABEL: test4: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test4: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI3_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI3_1) -; LA64-NEXT: ret +; CHECK-LABEL: test4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1277 +; CHECK-NEXT: vldi $vr1, -1021 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.3750000000, double 2.3750000000 } } define dso_local { float, double } @test5() { -; LA32-LABEL: test5: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI4_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI4_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test5: -; LA64: # %bb.0: 
# %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI4_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI4_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI4_1) -; LA64-NEXT: ret +; CHECK-LABEL: test5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1276 +; CHECK-NEXT: vldi $vr1, -1020 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.5000000000, double 2.5000000000 } } define dso_local { float, double } @test6() { -; LA32-LABEL: test6: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI5_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test6: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI5_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI5_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI5_1) -; LA64-NEXT: ret +; CHECK-LABEL: test6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1275 +; CHECK-NEXT: vldi $vr1, -1019 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.6250000000, double 2.6250000000 } } define dso_local { float, double } @test7() { -; LA32-LABEL: test7: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI6_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI6_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test7: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI6_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI6_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI6_1) -; LA64-NEXT: ret +; CHECK-LABEL: test7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1274 +; CHECK-NEXT: vldi $vr1, -1018 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.7500000000, double 2.7500000000 } } 
define dso_local { float, double } @test8() { -; LA32-LABEL: test8: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI7_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test8: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI7_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI7_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI7_1) -; LA64-NEXT: ret +; CHECK-LABEL: test8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1273 +; CHECK-NEXT: vldi $vr1, -1017 +; CHECK-NEXT: ret entry: ret { float, double } { float 2.8750000000, double 2.8750000000 } } define dso_local { float, double } @test9() { -; LA32-LABEL: test9: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI8_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI8_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test9: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI8_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI8_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI8_1) -; LA64-NEXT: ret +; CHECK-LABEL: test9: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1272 +; CHECK-NEXT: vldi $vr1, -1016 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.0000000000, double 3.0000000000 } } define dso_local { float, double } @test10() { -; LA32-LABEL: test10: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI9_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test10: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_0) -; LA64-NEXT: fld.s 
$fa0, $a0, %pc_lo12(.LCPI9_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI9_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI9_1) -; LA64-NEXT: ret +; CHECK-LABEL: test10: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1271 +; CHECK-NEXT: vldi $vr1, -1015 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.1250000000, double 3.1250000000 } } define dso_local { float, double } @test11() { -; LA32-LABEL: test11: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI10_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI10_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test11: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI10_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI10_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI10_1) -; LA64-NEXT: ret +; CHECK-LABEL: test11: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1270 +; CHECK-NEXT: vldi $vr1, -1014 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.2500000000, double 3.2500000000 } } define dso_local { float, double } @test12() { -; LA32-LABEL: test12: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI11_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI11_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test12: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI11_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI11_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI11_1) -; LA64-NEXT: ret +; CHECK-LABEL: test12: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1269 +; CHECK-NEXT: vldi $vr1, -1013 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.3750000000, double 3.3750000000 } } define dso_local { float, double } @test13() { -; 
LA32-LABEL: test13: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI12_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test13: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI12_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI12_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI12_1) -; LA64-NEXT: ret +; CHECK-LABEL: test13: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1268 +; CHECK-NEXT: vldi $vr1, -1012 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.5000000000, double 3.5000000000 } } define dso_local { float, double } @test14() { -; LA32-LABEL: test14: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI13_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI13_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test14: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI13_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI13_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI13_1) -; LA64-NEXT: ret +; CHECK-LABEL: test14: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1267 +; CHECK-NEXT: vldi $vr1, -1011 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.6250000000, double 3.6250000000 } } define dso_local { float, double } @test15() { -; LA32-LABEL: test15: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI14_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test15: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_0) -; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI14_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI14_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI14_1) -; LA64-NEXT: ret +; CHECK-LABEL: test15: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1266 +; CHECK-NEXT: vldi $vr1, -1010 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.7500000000, double 3.7500000000 } } define dso_local { float, double } @test16() { -; LA32-LABEL: test16: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI15_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI15_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test16: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI15_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI15_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI15_1) -; LA64-NEXT: ret +; CHECK-LABEL: test16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1265 +; CHECK-NEXT: vldi $vr1, -1009 +; CHECK-NEXT: ret entry: ret { float, double } { float 3.8750000000, double 3.8750000000 } } define dso_local { float, double } @test17() { -; LA32-LABEL: test17: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI16_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI16_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test17: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI16_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI16_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI16_1) -; LA64-NEXT: ret +; CHECK-LABEL: test17: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1264 +; CHECK-NEXT: vldi $vr1, -1008 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.0000000000, double 4.0000000000 } } define dso_local { float, double } @test18() { -; 
LA32-LABEL: test18: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI17_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI17_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test18: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI17_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI17_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI17_1) -; LA64-NEXT: ret +; CHECK-LABEL: test18: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1263 +; CHECK-NEXT: vldi $vr1, -1007 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.2500000000, double 4.2500000000 } } define dso_local { float, double } @test19() { -; LA32-LABEL: test19: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI18_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI18_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test19: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI18_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI18_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI18_1) -; LA64-NEXT: ret +; CHECK-LABEL: test19: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1262 +; CHECK-NEXT: vldi $vr1, -1006 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.5000000000, double 4.5000000000 } } define dso_local { float, double } @test20() { -; LA32-LABEL: test20: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI19_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI19_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test20: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_0) -; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI19_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI19_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI19_1) -; LA64-NEXT: ret +; CHECK-LABEL: test20: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1261 +; CHECK-NEXT: vldi $vr1, -1005 +; CHECK-NEXT: ret entry: ret { float, double } { float 4.7500000000, double 4.7500000000 } } define dso_local { float, double } @test21() { -; LA32-LABEL: test21: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI20_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI20_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test21: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI20_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI20_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI20_1) -; LA64-NEXT: ret +; CHECK-LABEL: test21: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1260 +; CHECK-NEXT: vldi $vr1, -1004 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.0000000000, double 5.0000000000 } } define dso_local { float, double } @test22() { -; LA32-LABEL: test22: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI21_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI21_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test22: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI21_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI21_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI21_1) -; LA64-NEXT: ret +; CHECK-LABEL: test22: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1259 +; CHECK-NEXT: vldi $vr1, -1003 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.2500000000, double 5.2500000000 } } define dso_local { float, double } @test23() { -; 
LA32-LABEL: test23: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI22_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI22_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test23: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI22_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI22_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI22_1) -; LA64-NEXT: ret +; CHECK-LABEL: test23: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1258 +; CHECK-NEXT: vldi $vr1, -1002 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.5000000000, double 5.5000000000 } } define dso_local { float, double } @test24() { -; LA32-LABEL: test24: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI23_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI23_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test24: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI23_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI23_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI23_1) -; LA64-NEXT: ret +; CHECK-LABEL: test24: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1257 +; CHECK-NEXT: vldi $vr1, -1001 +; CHECK-NEXT: ret entry: ret { float, double } { float 5.7500000000, double 5.7500000000 } } define dso_local { float, double } @test25() { -; LA32-LABEL: test25: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI24_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI24_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test25: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_0) -; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI24_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI24_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI24_1) -; LA64-NEXT: ret +; CHECK-LABEL: test25: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1256 +; CHECK-NEXT: vldi $vr1, -1000 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.0000000000, double 6.0000000000 } } define dso_local { float, double } @test26() { -; LA32-LABEL: test26: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI25_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI25_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test26: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI25_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI25_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI25_1) -; LA64-NEXT: ret +; CHECK-LABEL: test26: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1255 +; CHECK-NEXT: vldi $vr1, -999 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.2500000000, double 6.2500000000 } } define dso_local { float, double } @test27() { -; LA32-LABEL: test27: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI26_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI26_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test27: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI26_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI26_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI26_1) -; LA64-NEXT: ret +; CHECK-LABEL: test27: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1254 +; CHECK-NEXT: vldi $vr1, -998 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.5000000000, double 6.5000000000 } } define dso_local { float, double } @test28() { -; LA32-LABEL: 
test28: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI27_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI27_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test28: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI27_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI27_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI27_1) -; LA64-NEXT: ret +; CHECK-LABEL: test28: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1253 +; CHECK-NEXT: vldi $vr1, -997 +; CHECK-NEXT: ret entry: ret { float, double } { float 6.7500000000, double 6.7500000000 } } define dso_local { float, double } @test29() { -; LA32-LABEL: test29: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI28_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI28_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test29: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI28_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI28_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI28_1) -; LA64-NEXT: ret +; CHECK-LABEL: test29: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1252 +; CHECK-NEXT: vldi $vr1, -996 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.0000000000, double 7.0000000000 } } define dso_local { float, double } @test30() { -; LA32-LABEL: test30: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI29_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI29_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test30: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI29_0) -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI29_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI29_1) -; LA64-NEXT: ret +; CHECK-LABEL: test30: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1251 +; CHECK-NEXT: vldi $vr1, -995 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.2500000000, double 7.2500000000 } } define dso_local { float, double } @test31() { -; LA32-LABEL: test31: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI30_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI30_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test31: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI30_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI30_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI30_1) -; LA64-NEXT: ret +; CHECK-LABEL: test31: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1250 +; CHECK-NEXT: vldi $vr1, -994 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.5000000000, double 7.5000000000 } } define dso_local { float, double } @test32() { -; LA32-LABEL: test32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI31_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI31_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test32: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI31_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI31_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI31_1) -; LA64-NEXT: ret +; CHECK-LABEL: test32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1249 +; CHECK-NEXT: vldi $vr1, -993 +; CHECK-NEXT: ret entry: ret { float, double } { float 7.7500000000, double 7.7500000000 } } define dso_local { float, double } @test33() { -; LA32-LABEL: test33: -; LA32: # 
%bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI32_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI32_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test33: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI32_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI32_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI32_1) -; LA64-NEXT: ret +; CHECK-LABEL: test33: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1248 +; CHECK-NEXT: vldi $vr1, -992 +; CHECK-NEXT: ret entry: ret { float, double } { float 8.0000000000, double 8.0000000000 } } define dso_local { float, double } @test34() { -; LA32-LABEL: test34: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI33_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI33_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test34: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI33_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI33_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI33_1) -; LA64-NEXT: ret +; CHECK-LABEL: test34: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1247 +; CHECK-NEXT: vldi $vr1, -991 +; CHECK-NEXT: ret entry: ret { float, double } { float 8.5000000000, double 8.5000000000 } } define dso_local { float, double } @test35() { -; LA32-LABEL: test35: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI34_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI34_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test35: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI34_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI34_0) -; LA64-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI34_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI34_1) -; LA64-NEXT: ret +; CHECK-LABEL: test35: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1246 +; CHECK-NEXT: vldi $vr1, -990 +; CHECK-NEXT: ret entry: ret { float, double } { float 9.0000000000, double 9.0000000000 } } define dso_local { float, double } @test36() { -; LA32-LABEL: test36: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI35_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI35_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test36: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI35_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI35_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI35_1) -; LA64-NEXT: ret +; CHECK-LABEL: test36: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1245 +; CHECK-NEXT: vldi $vr1, -989 +; CHECK-NEXT: ret entry: ret { float, double } { float 9.5000000000, double 9.5000000000 } } define dso_local { float, double } @test37() { -; LA32-LABEL: test37: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI36_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI36_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test37: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI36_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI36_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI36_1) -; LA64-NEXT: ret +; CHECK-LABEL: test37: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1244 +; CHECK-NEXT: vldi $vr1, -988 +; CHECK-NEXT: ret entry: ret { float, double } { float 10.0000000000, double 10.0000000000 } } define dso_local { float, double } @test38() { -; LA32-LABEL: test38: -; LA32: # %bb.0: # 
%entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI37_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI37_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test38: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI37_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI37_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI37_1) -; LA64-NEXT: ret +; CHECK-LABEL: test38: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1243 +; CHECK-NEXT: vldi $vr1, -987 +; CHECK-NEXT: ret entry: ret { float, double } { float 10.5000000000, double 10.5000000000 } } define dso_local { float, double } @test39() { -; LA32-LABEL: test39: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI38_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI38_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test39: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI38_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI38_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI38_1) -; LA64-NEXT: ret +; CHECK-LABEL: test39: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1242 +; CHECK-NEXT: vldi $vr1, -986 +; CHECK-NEXT: ret entry: ret { float, double } { float 11.0000000000, double 11.0000000000 } } define dso_local { float, double } @test40() { -; LA32-LABEL: test40: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI39_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI39_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test40: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI39_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI39_0) -; LA64-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI39_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI39_1) -; LA64-NEXT: ret +; CHECK-LABEL: test40: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1241 +; CHECK-NEXT: vldi $vr1, -985 +; CHECK-NEXT: ret entry: ret { float, double } { float 11.5000000000, double 11.5000000000 } } define dso_local { float, double } @test41() { -; LA32-LABEL: test41: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI40_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI40_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test41: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI40_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI40_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI40_1) -; LA64-NEXT: ret +; CHECK-LABEL: test41: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1240 +; CHECK-NEXT: vldi $vr1, -984 +; CHECK-NEXT: ret entry: ret { float, double } { float 12.0000000000, double 12.0000000000 } } define dso_local { float, double } @test42() { -; LA32-LABEL: test42: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI41_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI41_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test42: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI41_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI41_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI41_1) -; LA64-NEXT: ret +; CHECK-LABEL: test42: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1239 +; CHECK-NEXT: vldi $vr1, -983 +; CHECK-NEXT: ret entry: ret { float, double } { float 12.5000000000, double 12.5000000000 } } define dso_local { float, double } @test43() { -; LA32-LABEL: test43: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI42_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI42_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test43: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI42_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI42_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI42_1) -; LA64-NEXT: ret +; CHECK-LABEL: test43: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1238 +; CHECK-NEXT: vldi $vr1, -982 +; CHECK-NEXT: ret entry: ret { float, double } { float 13.0000000000, double 13.0000000000 } } define dso_local { float, double } @test44() { -; LA32-LABEL: test44: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI43_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI43_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test44: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI43_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI43_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI43_1) -; LA64-NEXT: ret +; CHECK-LABEL: test44: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1237 +; CHECK-NEXT: vldi $vr1, -981 +; CHECK-NEXT: ret entry: ret { float, double } { float 13.5000000000, double 13.5000000000 } } define dso_local { float, double } @test45() { -; LA32-LABEL: test45: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI44_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI44_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test45: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI44_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI44_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI44_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI44_1) -; LA64-NEXT: ret +; CHECK-LABEL: test45: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1236 +; CHECK-NEXT: vldi $vr1, -980 +; CHECK-NEXT: ret entry: ret { float, double } { float 14.0000000000, double 14.0000000000 } } define dso_local { float, double } @test46() { -; LA32-LABEL: test46: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI45_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI45_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test46: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI45_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI45_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI45_1) -; LA64-NEXT: ret +; CHECK-LABEL: test46: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1235 +; CHECK-NEXT: vldi $vr1, -979 +; CHECK-NEXT: ret entry: ret { float, double } { float 14.5000000000, double 14.5000000000 } } define dso_local { float, double } @test47() { -; LA32-LABEL: test47: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI46_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI46_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test47: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI46_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI46_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI46_1) -; LA64-NEXT: ret +; CHECK-LABEL: test47: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1234 +; CHECK-NEXT: vldi $vr1, -978 +; CHECK-NEXT: ret entry: ret { float, double } { float 15.0000000000, double 15.0000000000 } } define dso_local { float, double } @test48() { -; LA32-LABEL: test48: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI47_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI47_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test48: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI47_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI47_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI47_1) -; LA64-NEXT: ret +; CHECK-LABEL: test48: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1233 +; CHECK-NEXT: vldi $vr1, -977 +; CHECK-NEXT: ret entry: ret { float, double } { float 15.5000000000, double 15.5000000000 } } define dso_local { float, double } @test49() { -; LA32-LABEL: test49: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI48_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI48_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test49: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI48_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI48_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI48_1) -; LA64-NEXT: ret +; CHECK-LABEL: test49: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1232 +; CHECK-NEXT: vldi $vr1, -976 +; CHECK-NEXT: ret entry: ret { float, double } { float 16.0000000000, double 16.0000000000 } } define dso_local { float, double } @test50() { -; LA32-LABEL: test50: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI49_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI49_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test50: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI49_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI49_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI49_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI49_1) -; LA64-NEXT: ret +; CHECK-LABEL: test50: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1231 +; CHECK-NEXT: vldi $vr1, -975 +; CHECK-NEXT: ret entry: ret { float, double } { float 17.0000000000, double 17.0000000000 } } define dso_local { float, double } @test51() { -; LA32-LABEL: test51: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI50_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI50_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test51: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI50_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI50_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI50_1) -; LA64-NEXT: ret +; CHECK-LABEL: test51: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1230 +; CHECK-NEXT: vldi $vr1, -974 +; CHECK-NEXT: ret entry: ret { float, double } { float 18.0000000000, double 18.0000000000 } } define dso_local { float, double } @test52() { -; LA32-LABEL: test52: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI51_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI51_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test52: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI51_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI51_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI51_1) -; LA64-NEXT: ret +; CHECK-LABEL: test52: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1229 +; CHECK-NEXT: vldi $vr1, -973 +; CHECK-NEXT: ret entry: ret { float, double } { float 19.0000000000, double 19.0000000000 } } define dso_local { float, double } @test53() { -; LA32-LABEL: test53: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI52_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI52_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test53: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI52_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI52_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI52_1) -; LA64-NEXT: ret +; CHECK-LABEL: test53: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1228 +; CHECK-NEXT: vldi $vr1, -972 +; CHECK-NEXT: ret entry: ret { float, double } { float 20.0000000000, double 20.0000000000 } } define dso_local { float, double } @test54() { -; LA32-LABEL: test54: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI53_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI53_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test54: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI53_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI53_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI53_1) -; LA64-NEXT: ret +; CHECK-LABEL: test54: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1227 +; CHECK-NEXT: vldi $vr1, -971 +; CHECK-NEXT: ret entry: ret { float, double } { float 21.0000000000, double 21.0000000000 } } define dso_local { float, double } @test55() { -; LA32-LABEL: test55: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI54_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI54_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test55: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI54_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI54_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI54_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI54_1) -; LA64-NEXT: ret +; CHECK-LABEL: test55: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1226 +; CHECK-NEXT: vldi $vr1, -970 +; CHECK-NEXT: ret entry: ret { float, double } { float 22.0000000000, double 22.0000000000 } } define dso_local { float, double } @test56() { -; LA32-LABEL: test56: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI55_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI55_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test56: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI55_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI55_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI55_1) -; LA64-NEXT: ret +; CHECK-LABEL: test56: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1225 +; CHECK-NEXT: vldi $vr1, -969 +; CHECK-NEXT: ret entry: ret { float, double } { float 23.0000000000, double 23.0000000000 } } define dso_local { float, double } @test57() { -; LA32-LABEL: test57: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI56_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI56_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test57: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI56_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI56_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI56_1) -; LA64-NEXT: ret +; CHECK-LABEL: test57: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1224 +; CHECK-NEXT: vldi $vr1, -968 +; CHECK-NEXT: ret entry: ret { float, double } { float 24.0000000000, double 24.0000000000 } } define dso_local { float, double } @test58() { -; LA32-LABEL: test58: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI57_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI57_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test58: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI57_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI57_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI57_1) -; LA64-NEXT: ret +; CHECK-LABEL: test58: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1223 +; CHECK-NEXT: vldi $vr1, -967 +; CHECK-NEXT: ret entry: ret { float, double } { float 25.0000000000, double 25.0000000000 } } define dso_local { float, double } @test59() { -; LA32-LABEL: test59: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI58_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI58_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test59: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI58_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI58_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI58_1) -; LA64-NEXT: ret +; CHECK-LABEL: test59: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1222 +; CHECK-NEXT: vldi $vr1, -966 +; CHECK-NEXT: ret entry: ret { float, double } { float 26.0000000000, double 26.0000000000 } } define dso_local { float, double } @test60() { -; LA32-LABEL: test60: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI59_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI59_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test60: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI59_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI59_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI59_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI59_1) -; LA64-NEXT: ret +; CHECK-LABEL: test60: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1221 +; CHECK-NEXT: vldi $vr1, -965 +; CHECK-NEXT: ret entry: ret { float, double } { float 27.0000000000, double 27.0000000000 } } define dso_local { float, double } @test61() { -; LA32-LABEL: test61: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI60_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI60_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test61: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI60_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI60_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI60_1) -; LA64-NEXT: ret +; CHECK-LABEL: test61: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1220 +; CHECK-NEXT: vldi $vr1, -964 +; CHECK-NEXT: ret entry: ret { float, double } { float 28.0000000000, double 28.0000000000 } } define dso_local { float, double } @test62() { -; LA32-LABEL: test62: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI61_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI61_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test62: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI61_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI61_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI61_1) -; LA64-NEXT: ret +; CHECK-LABEL: test62: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1219 +; CHECK-NEXT: vldi $vr1, -963 +; CHECK-NEXT: ret entry: ret { float, double } { float 29.0000000000, double 29.0000000000 } } define dso_local { float, double } @test63() { -; LA32-LABEL: test63: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI62_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI62_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test63: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI62_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI62_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI62_1) -; LA64-NEXT: ret +; CHECK-LABEL: test63: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1218 +; CHECK-NEXT: vldi $vr1, -962 +; CHECK-NEXT: ret entry: ret { float, double } { float 30.0000000000, double 30.0000000000 } } define dso_local { float, double } @test64() { -; LA32-LABEL: test64: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI63_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI63_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test64: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI63_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI63_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI63_1) -; LA64-NEXT: ret +; CHECK-LABEL: test64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1217 +; CHECK-NEXT: vldi $vr1, -961 +; CHECK-NEXT: ret entry: ret { float, double } { float 31.0000000000, double 31.0000000000 } } define dso_local { float, double } @test65() { -; LA32-LABEL: test65: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI64_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI64_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test65: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI64_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI64_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI64_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI64_1) -; LA64-NEXT: ret +; CHECK-LABEL: test65: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1216 +; CHECK-NEXT: vldi $vr1, -960 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1250000000, double 0.1250000000 } } define dso_local { float, double } @test66() { -; LA32-LABEL: test66: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI65_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI65_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test66: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI65_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI65_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI65_1) -; LA64-NEXT: ret +; CHECK-LABEL: test66: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1215 +; CHECK-NEXT: vldi $vr1, -959 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1328125000, double 0.1328125000 } } define dso_local { float, double } @test67() { -; LA32-LABEL: test67: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI66_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI66_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test67: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI66_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI66_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI66_1) -; LA64-NEXT: ret +; CHECK-LABEL: test67: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1214 +; CHECK-NEXT: vldi $vr1, -958 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1406250000, double 0.1406250000 } } define dso_local { float, double } @test68() { -; LA32-LABEL: test68: -; LA32: # %bb.0: # %entry -; LA32-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI67_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI67_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test68: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI67_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI67_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI67_1) -; LA64-NEXT: ret +; CHECK-LABEL: test68: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1213 +; CHECK-NEXT: vldi $vr1, -957 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1484375000, double 0.1484375000 } } define dso_local { float, double } @test69() { -; LA32-LABEL: test69: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI68_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI68_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test69: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI68_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI68_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI68_1) -; LA64-NEXT: ret +; CHECK-LABEL: test69: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1212 +; CHECK-NEXT: vldi $vr1, -956 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1562500000, double 0.1562500000 } } define dso_local { float, double } @test70() { -; LA32-LABEL: test70: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI69_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI69_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test70: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI69_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI69_1) -; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI69_1) -; LA64-NEXT: ret +; CHECK-LABEL: test70: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1211 +; CHECK-NEXT: vldi $vr1, -955 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1640625000, double 0.1640625000 } } define dso_local { float, double } @test71() { -; LA32-LABEL: test71: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI70_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI70_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test71: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI70_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI70_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI70_1) -; LA64-NEXT: ret +; CHECK-LABEL: test71: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1210 +; CHECK-NEXT: vldi $vr1, -954 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1718750000, double 0.1718750000 } } define dso_local { float, double } @test72() { -; LA32-LABEL: test72: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI71_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI71_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test72: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI71_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI71_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI71_1) -; LA64-NEXT: ret +; CHECK-LABEL: test72: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1209 +; CHECK-NEXT: vldi $vr1, -953 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1796875000, double 0.1796875000 } } define dso_local { float, double } @test73() { -; LA32-LABEL: test73: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI72_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI72_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI72_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test73: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI72_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI72_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI72_1) -; LA64-NEXT: ret +; CHECK-LABEL: test73: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1208 +; CHECK-NEXT: vldi $vr1, -952 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1875000000, double 0.1875000000 } } define dso_local { float, double } @test74() { -; LA32-LABEL: test74: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI73_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI73_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test74: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI73_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI73_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI73_1) -; LA64-NEXT: ret +; CHECK-LABEL: test74: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1207 +; CHECK-NEXT: vldi $vr1, -951 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.1953125000, double 0.1953125000 } } define dso_local { float, double } @test75() { -; LA32-LABEL: test75: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI74_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI74_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test75: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI74_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI74_1) -; LA64-NEXT: 
fld.d $fa1, $a0, %pc_lo12(.LCPI74_1) -; LA64-NEXT: ret +; CHECK-LABEL: test75: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1206 +; CHECK-NEXT: vldi $vr1, -950 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2031250000, double 0.2031250000 } } define dso_local { float, double } @test76() { -; LA32-LABEL: test76: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI75_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI75_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test76: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI75_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI75_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI75_1) -; LA64-NEXT: ret +; CHECK-LABEL: test76: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1205 +; CHECK-NEXT: vldi $vr1, -949 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2109375000, double 0.2109375000 } } define dso_local { float, double } @test77() { -; LA32-LABEL: test77: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI76_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI76_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test77: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI76_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI76_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI76_1) -; LA64-NEXT: ret +; CHECK-LABEL: test77: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1204 +; CHECK-NEXT: vldi $vr1, -948 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2187500000, double 0.2187500000 } } define dso_local { float, double } @test78() { -; LA32-LABEL: test78: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) 
-; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI77_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI77_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test78: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI77_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI77_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI77_1) -; LA64-NEXT: ret +; CHECK-LABEL: test78: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1203 +; CHECK-NEXT: vldi $vr1, -947 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2265625000, double 0.2265625000 } } define dso_local { float, double } @test79() { -; LA32-LABEL: test79: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI78_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI78_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test79: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI78_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI78_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI78_1) -; LA64-NEXT: ret +; CHECK-LABEL: test79: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1202 +; CHECK-NEXT: vldi $vr1, -946 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2343750000, double 0.2343750000 } } define dso_local { float, double } @test80() { -; LA32-LABEL: test80: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI79_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI79_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test80: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI79_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI79_1) -; LA64-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI79_1) -; LA64-NEXT: ret +; CHECK-LABEL: test80: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1201 +; CHECK-NEXT: vldi $vr1, -945 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2421875000, double 0.2421875000 } } define dso_local { float, double } @test81() { -; LA32-LABEL: test81: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI80_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI80_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test81: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI80_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI80_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI80_1) -; LA64-NEXT: ret +; CHECK-LABEL: test81: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1200 +; CHECK-NEXT: vldi $vr1, -944 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2500000000, double 0.2500000000 } } define dso_local { float, double } @test82() { -; LA32-LABEL: test82: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI81_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI81_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test82: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI81_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI81_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI81_1) -; LA64-NEXT: ret +; CHECK-LABEL: test82: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1199 +; CHECK-NEXT: vldi $vr1, -943 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2656250000, double 0.2656250000 } } define dso_local { float, double } @test83() { -; LA32-LABEL: test83: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA32-NEXT: 
fld.s $fa0, $a0, %pc_lo12(.LCPI82_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI82_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test83: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI82_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI82_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI82_1) -; LA64-NEXT: ret +; CHECK-LABEL: test83: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1198 +; CHECK-NEXT: vldi $vr1, -942 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2812500000, double 0.2812500000 } } define dso_local { float, double } @test84() { -; LA32-LABEL: test84: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI83_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI83_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test84: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI83_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI83_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI83_1) -; LA64-NEXT: ret +; CHECK-LABEL: test84: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1197 +; CHECK-NEXT: vldi $vr1, -941 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.2968750000, double 0.2968750000 } } define dso_local { float, double } @test85() { -; LA32-LABEL: test85: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI84_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI84_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test85: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI84_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI84_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI84_1) 
-; LA64-NEXT: ret +; CHECK-LABEL: test85: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1196 +; CHECK-NEXT: vldi $vr1, -940 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3125000000, double 0.3125000000 } } define dso_local { float, double } @test86() { -; LA32-LABEL: test86: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI85_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI85_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test86: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI85_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI85_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI85_1) -; LA64-NEXT: ret +; CHECK-LABEL: test86: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1195 +; CHECK-NEXT: vldi $vr1, -939 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3281250000, double 0.3281250000 } } define dso_local { float, double } @test87() { -; LA32-LABEL: test87: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI86_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI86_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test87: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI86_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI86_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI86_1) -; LA64-NEXT: ret +; CHECK-LABEL: test87: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1194 +; CHECK-NEXT: vldi $vr1, -938 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3437500000, double 0.3437500000 } } define dso_local { float, double } @test88() { -; LA32-LABEL: test88: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA32-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI87_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI87_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test88: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI87_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI87_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI87_1) -; LA64-NEXT: ret +; CHECK-LABEL: test88: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1193 +; CHECK-NEXT: vldi $vr1, -937 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3593750000, double 0.3593750000 } } define dso_local { float, double } @test89() { -; LA32-LABEL: test89: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI88_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI88_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test89: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI88_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI88_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI88_1) -; LA64-NEXT: ret +; CHECK-LABEL: test89: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1192 +; CHECK-NEXT: vldi $vr1, -936 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3750000000, double 0.3750000000 } } define dso_local { float, double } @test90() { -; LA32-LABEL: test90: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI89_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI89_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test90: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI89_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI89_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI89_1) -; LA64-NEXT: ret 
+; CHECK-LABEL: test90: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1191 +; CHECK-NEXT: vldi $vr1, -935 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.3906250000, double 0.3906250000 } } define dso_local { float, double } @test91() { -; LA32-LABEL: test91: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI90_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI90_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test91: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI90_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI90_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI90_1) -; LA64-NEXT: ret +; CHECK-LABEL: test91: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1190 +; CHECK-NEXT: vldi $vr1, -934 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4062500000, double 0.4062500000 } } define dso_local { float, double } @test92() { -; LA32-LABEL: test92: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI91_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI91_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test92: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI91_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI91_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI91_1) -; LA64-NEXT: ret +; CHECK-LABEL: test92: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1189 +; CHECK-NEXT: vldi $vr1, -933 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4218750000, double 0.4218750000 } } define dso_local { float, double } @test93() { -; LA32-LABEL: test93: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI92_0) -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI92_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test93: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI92_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI92_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI92_1) -; LA64-NEXT: ret +; CHECK-LABEL: test93: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1188 +; CHECK-NEXT: vldi $vr1, -932 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4375000000, double 0.4375000000 } } define dso_local { float, double } @test94() { -; LA32-LABEL: test94: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI93_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI93_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test94: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI93_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI93_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI93_1) -; LA64-NEXT: ret +; CHECK-LABEL: test94: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1187 +; CHECK-NEXT: vldi $vr1, -931 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4531250000, double 0.4531250000 } } define dso_local { float, double } @test95() { -; LA32-LABEL: test95: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI94_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI94_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test95: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI94_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI94_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI94_1) -; LA64-NEXT: ret +; CHECK-LABEL: test95: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1186 +; CHECK-NEXT: vldi $vr1, -930 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4687500000, double 0.4687500000 } } define dso_local { float, double } @test96() { -; LA32-LABEL: test96: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI95_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI95_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test96: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI95_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI95_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI95_1) -; LA64-NEXT: ret +; CHECK-LABEL: test96: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1185 +; CHECK-NEXT: vldi $vr1, -929 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.4843750000, double 0.4843750000 } } define dso_local { float, double } @test97() { -; LA32-LABEL: test97: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI96_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI96_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test97: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI96_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI96_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI96_1) -; LA64-NEXT: ret +; CHECK-LABEL: test97: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1184 +; CHECK-NEXT: vldi $vr1, -928 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5000000000, double 0.5000000000 } } define dso_local { float, double } @test98() { -; LA32-LABEL: test98: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI97_0) -; LA32-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI97_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI97_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test98: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI97_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI97_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI97_1) -; LA64-NEXT: ret +; CHECK-LABEL: test98: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1183 +; CHECK-NEXT: vldi $vr1, -927 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5312500000, double 0.5312500000 } } define dso_local { float, double } @test99() { -; LA32-LABEL: test99: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI98_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI98_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test99: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI98_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI98_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI98_1) -; LA64-NEXT: ret +; CHECK-LABEL: test99: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1182 +; CHECK-NEXT: vldi $vr1, -926 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5625000000, double 0.5625000000 } } define dso_local { float, double } @test100() { -; LA32-LABEL: test100: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI99_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI99_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test100: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI99_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI99_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI99_1) -; LA64-NEXT: ret +; CHECK-LABEL: test100: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1181 +; CHECK-NEXT: vldi $vr1, -925 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.5937500000, double 0.5937500000 } } define dso_local { float, double } @test101() { -; LA32-LABEL: test101: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI100_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI100_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test101: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI100_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI100_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI100_1) -; LA64-NEXT: ret +; CHECK-LABEL: test101: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1180 +; CHECK-NEXT: vldi $vr1, -924 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.6250000000, double 0.6250000000 } } define dso_local { float, double } @test102() { -; LA32-LABEL: test102: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI101_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI101_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test102: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI101_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI101_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI101_1) -; LA64-NEXT: ret +; CHECK-LABEL: test102: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1179 +; CHECK-NEXT: vldi $vr1, -923 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.6562500000, double 0.6562500000 } } define dso_local { float, double } @test103() { -; LA32-LABEL: test103: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI102_0) -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI102_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test103: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI102_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI102_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI102_1) -; LA64-NEXT: ret +; CHECK-LABEL: test103: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1178 +; CHECK-NEXT: vldi $vr1, -922 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.6875000000, double 0.6875000000 } } define dso_local { float, double } @test104() { -; LA32-LABEL: test104: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI103_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI103_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test104: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI103_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI103_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI103_1) -; LA64-NEXT: ret +; CHECK-LABEL: test104: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1177 +; CHECK-NEXT: vldi $vr1, -921 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7187500000, double 0.7187500000 } } define dso_local { float, double } @test105() { -; LA32-LABEL: test105: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI104_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI104_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test105: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI104_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI104_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI104_1) -; 
LA64-NEXT: ret +; CHECK-LABEL: test105: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1176 +; CHECK-NEXT: vldi $vr1, -920 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7500000000, double 0.7500000000 } } define dso_local { float, double } @test106() { -; LA32-LABEL: test106: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI105_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI105_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test106: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI105_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI105_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI105_1) -; LA64-NEXT: ret +; CHECK-LABEL: test106: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1175 +; CHECK-NEXT: vldi $vr1, -919 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.7812500000, double 0.7812500000 } } define dso_local { float, double } @test107() { -; LA32-LABEL: test107: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI106_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI106_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test107: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI106_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI106_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI106_1) -; LA64-NEXT: ret +; CHECK-LABEL: test107: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1174 +; CHECK-NEXT: vldi $vr1, -918 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8125000000, double 0.8125000000 } } define dso_local { float, double } @test108() { -; LA32-LABEL: test108: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; 
LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI107_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI107_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test108: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI107_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI107_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI107_1) -; LA64-NEXT: ret +; CHECK-LABEL: test108: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1173 +; CHECK-NEXT: vldi $vr1, -917 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8437500000, double 0.8437500000 } } define dso_local { float, double } @test109() { -; LA32-LABEL: test109: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI108_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI108_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test109: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI108_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI108_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI108_1) -; LA64-NEXT: ret +; CHECK-LABEL: test109: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1172 +; CHECK-NEXT: vldi $vr1, -916 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.8750000000, double 0.8750000000 } } define dso_local { float, double } @test110() { -; LA32-LABEL: test110: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI109_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI109_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test110: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI109_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI109_1) -; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI109_1) -; LA64-NEXT: ret +; CHECK-LABEL: test110: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1171 +; CHECK-NEXT: vldi $vr1, -915 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9062500000, double 0.9062500000 } } define dso_local { float, double } @test111() { -; LA32-LABEL: test111: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI110_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI110_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test111: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI110_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI110_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI110_1) -; LA64-NEXT: ret +; CHECK-LABEL: test111: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1170 +; CHECK-NEXT: vldi $vr1, -914 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9375000000, double 0.9375000000 } } define dso_local { float, double } @test112() { -; LA32-LABEL: test112: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI111_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI111_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test112: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI111_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI111_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI111_1) -; LA64-NEXT: ret +; CHECK-LABEL: test112: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1169 +; CHECK-NEXT: vldi $vr1, -913 +; CHECK-NEXT: ret entry: ret { float, double } { float 0.9687500000, double 0.9687500000 } } define dso_local { float, double } @test113() { -; LA32-LABEL: test113: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: addi.w $a0, $zero, 1 -; LA32-NEXT: movgr2fr.w $fa0, $a0 -; LA32-NEXT: ffint.s.w $fa0, $fa0 -; LA32-NEXT: fcvt.d.s $fa1, $fa0 -; LA32-NEXT: ret -; -; LA64-LABEL: test113: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.w $a0, $zero, 1 -; LA64-NEXT: movgr2fr.w $fa0, $a0 -; LA64-NEXT: ffint.s.w $fa0, $fa0 -; LA64-NEXT: addi.d $a0, $zero, 1 -; LA64-NEXT: movgr2fr.d $fa1, $a0 -; LA64-NEXT: ffint.d.l $fa1, $fa1 -; LA64-NEXT: ret +; CHECK-LABEL: test113: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1168 +; CHECK-NEXT: vldi $vr1, -912 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.0000000000, double 1.0000000000 } } define dso_local { float, double } @test114() { -; LA32-LABEL: test114: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI113_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI113_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test114: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI113_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI113_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI113_1) -; LA64-NEXT: ret +; CHECK-LABEL: test114: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1167 +; CHECK-NEXT: vldi $vr1, -911 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.0625000000, double 1.0625000000 } } define dso_local { float, double } @test115() { -; LA32-LABEL: test115: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI114_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI114_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test115: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI114_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI114_1) -; LA64-NEXT: fld.d 
$fa1, $a0, %pc_lo12(.LCPI114_1) -; LA64-NEXT: ret +; CHECK-LABEL: test115: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1166 +; CHECK-NEXT: vldi $vr1, -910 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.1250000000, double 1.1250000000 } } define dso_local { float, double } @test116() { -; LA32-LABEL: test116: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI115_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI115_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test116: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI115_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI115_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI115_1) -; LA64-NEXT: ret +; CHECK-LABEL: test116: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1165 +; CHECK-NEXT: vldi $vr1, -909 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.1875000000, double 1.1875000000 } } define dso_local { float, double } @test117() { -; LA32-LABEL: test117: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI116_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI116_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test117: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI116_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI116_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI116_1) -; LA64-NEXT: ret +; CHECK-LABEL: test117: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1164 +; CHECK-NEXT: vldi $vr1, -908 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.2500000000, double 1.2500000000 } } define dso_local { float, double } @test118() { -; LA32-LABEL: test118: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI117_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI117_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI117_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test118: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI117_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI117_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI117_1) -; LA64-NEXT: ret +; CHECK-LABEL: test118: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1163 +; CHECK-NEXT: vldi $vr1, -907 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.3125000000, double 1.3125000000 } } define dso_local { float, double } @test119() { -; LA32-LABEL: test119: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI118_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI118_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test119: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI118_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI118_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI118_1) -; LA64-NEXT: ret +; CHECK-LABEL: test119: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1162 +; CHECK-NEXT: vldi $vr1, -906 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.3750000000, double 1.3750000000 } } define dso_local { float, double } @test120() { -; LA32-LABEL: test120: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI119_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI119_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test120: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI119_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI119_0) -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI119_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI119_1) -; LA64-NEXT: ret +; CHECK-LABEL: test120: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1161 +; CHECK-NEXT: vldi $vr1, -905 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.4375000000, double 1.4375000000 } } define dso_local { float, double } @test121() { -; LA32-LABEL: test121: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI120_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI120_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test121: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI120_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI120_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI120_1) -; LA64-NEXT: ret +; CHECK-LABEL: test121: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1160 +; CHECK-NEXT: vldi $vr1, -904 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.5000000000, double 1.5000000000 } } define dso_local { float, double } @test122() { -; LA32-LABEL: test122: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI121_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI121_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test122: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI121_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI121_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI121_1) -; LA64-NEXT: ret +; CHECK-LABEL: test122: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1159 +; CHECK-NEXT: vldi $vr1, -903 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.5625000000, double 1.5625000000 } } define dso_local { float, double } @test123() { -; LA32-LABEL: test123: -; LA32: # 
%bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI122_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI122_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test123: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI122_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI122_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI122_1) -; LA64-NEXT: ret +; CHECK-LABEL: test123: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1158 +; CHECK-NEXT: vldi $vr1, -902 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.6250000000, double 1.6250000000 } } define dso_local { float, double } @test124() { -; LA32-LABEL: test124: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI123_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI123_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test124: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI123_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI123_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI123_1) -; LA64-NEXT: ret +; CHECK-LABEL: test124: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1157 +; CHECK-NEXT: vldi $vr1, -901 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.6875000000, double 1.6875000000 } } define dso_local { float, double } @test125() { -; LA32-LABEL: test125: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI124_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI124_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test125: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_0) -; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI124_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI124_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI124_1) -; LA64-NEXT: ret +; CHECK-LABEL: test125: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1156 +; CHECK-NEXT: vldi $vr1, -900 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.7500000000, double 1.7500000000 } } define dso_local { float, double } @test126() { -; LA32-LABEL: test126: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI125_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI125_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test126: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI125_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI125_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI125_1) -; LA64-NEXT: ret +; CHECK-LABEL: test126: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1155 +; CHECK-NEXT: vldi $vr1, -899 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.8125000000, double 1.8125000000 } } define dso_local { float, double } @test127() { -; LA32-LABEL: test127: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI126_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI126_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test127: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI126_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI126_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI126_1) -; LA64-NEXT: ret +; CHECK-LABEL: test127: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1154 +; CHECK-NEXT: vldi $vr1, -898 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.8750000000, double 1.8750000000 } } define dso_local { float, double } 
@test128() { -; LA32-LABEL: test128: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI127_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI127_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test128: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI127_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI127_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI127_1) -; LA64-NEXT: ret +; CHECK-LABEL: test128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1153 +; CHECK-NEXT: vldi $vr1, -897 +; CHECK-NEXT: ret entry: ret { float, double } { float 1.9375000000, double 1.9375000000 } } define dso_local { float, double } @test129() { -; LA32-LABEL: test129: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI128_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI128_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test129: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI128_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI128_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI128_1) -; LA64-NEXT: ret +; CHECK-LABEL: test129: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1152 +; CHECK-NEXT: vldi $vr1, -896 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.0000000000, double -2.0000000000 } } define dso_local { float, double } @test130() { -; LA32-LABEL: test130: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI129_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI129_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test130: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI129_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI129_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI129_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI129_1) -; LA64-NEXT: ret +; CHECK-LABEL: test130: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1151 +; CHECK-NEXT: vldi $vr1, -895 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.1250000000, double -2.1250000000 } } define dso_local { float, double } @test131() { -; LA32-LABEL: test131: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI130_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI130_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test131: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI130_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI130_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI130_1) -; LA64-NEXT: ret +; CHECK-LABEL: test131: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1150 +; CHECK-NEXT: vldi $vr1, -894 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.2500000000, double -2.2500000000 } } define dso_local { float, double } @test132() { -; LA32-LABEL: test132: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI131_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI131_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test132: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI131_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI131_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI131_1) -; LA64-NEXT: ret +; CHECK-LABEL: test132: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1149 +; CHECK-NEXT: vldi $vr1, -893 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.3750000000, 
double -2.3750000000 } } define dso_local { float, double } @test133() { -; LA32-LABEL: test133: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI132_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI132_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test133: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI132_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI132_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI132_1) -; LA64-NEXT: ret +; CHECK-LABEL: test133: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1148 +; CHECK-NEXT: vldi $vr1, -892 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.5000000000, double -2.5000000000 } } define dso_local { float, double } @test134() { -; LA32-LABEL: test134: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI133_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI133_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test134: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI133_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI133_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI133_1) -; LA64-NEXT: ret +; CHECK-LABEL: test134: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1147 +; CHECK-NEXT: vldi $vr1, -891 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.6250000000, double -2.6250000000 } } define dso_local { float, double } @test135() { -; LA32-LABEL: test135: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI134_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI134_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test135: -; LA64: 
# %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI134_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI134_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI134_1) -; LA64-NEXT: ret +; CHECK-LABEL: test135: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1146 +; CHECK-NEXT: vldi $vr1, -890 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.7500000000, double -2.7500000000 } } define dso_local { float, double } @test136() { -; LA32-LABEL: test136: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI135_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI135_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test136: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI135_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI135_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI135_1) -; LA64-NEXT: ret +; CHECK-LABEL: test136: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1145 +; CHECK-NEXT: vldi $vr1, -889 +; CHECK-NEXT: ret entry: ret { float, double } { float -2.8750000000, double -2.8750000000 } } define dso_local { float, double } @test137() { -; LA32-LABEL: test137: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI136_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI136_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test137: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI136_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI136_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI136_1) -; LA64-NEXT: ret +; CHECK-LABEL: test137: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1144 +; CHECK-NEXT: vldi $vr1, -888 +; CHECK-NEXT: ret entry: 
ret { float, double } { float -3.0000000000, double -3.0000000000 } } define dso_local { float, double } @test138() { -; LA32-LABEL: test138: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI137_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI137_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test138: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI137_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI137_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI137_1) -; LA64-NEXT: ret +; CHECK-LABEL: test138: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1143 +; CHECK-NEXT: vldi $vr1, -887 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.1250000000, double -3.1250000000 } } define dso_local { float, double } @test139() { -; LA32-LABEL: test139: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI138_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI138_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test139: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI138_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI138_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI138_1) -; LA64-NEXT: ret +; CHECK-LABEL: test139: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1142 +; CHECK-NEXT: vldi $vr1, -886 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.2500000000, double -3.2500000000 } } define dso_local { float, double } @test140() { -; LA32-LABEL: test140: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI139_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI139_1) -; 
LA32-NEXT: ret -; -; LA64-LABEL: test140: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI139_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI139_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI139_1) -; LA64-NEXT: ret +; CHECK-LABEL: test140: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1141 +; CHECK-NEXT: vldi $vr1, -885 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.3750000000, double -3.3750000000 } } define dso_local { float, double } @test141() { -; LA32-LABEL: test141: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI140_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI140_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test141: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI140_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI140_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI140_1) -; LA64-NEXT: ret +; CHECK-LABEL: test141: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1140 +; CHECK-NEXT: vldi $vr1, -884 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.5000000000, double -3.5000000000 } } define dso_local { float, double } @test142() { -; LA32-LABEL: test142: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI141_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI141_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test142: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI141_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI141_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI141_1) -; LA64-NEXT: ret +; CHECK-LABEL: test142: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1139 +; 
CHECK-NEXT: vldi $vr1, -883 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.6250000000, double -3.6250000000 } } define dso_local { float, double } @test143() { -; LA32-LABEL: test143: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI142_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI142_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test143: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI142_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI142_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI142_1) -; LA64-NEXT: ret +; CHECK-LABEL: test143: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1138 +; CHECK-NEXT: vldi $vr1, -882 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.7500000000, double -3.7500000000 } } define dso_local { float, double } @test144() { -; LA32-LABEL: test144: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI143_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI143_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test144: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI143_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI143_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI143_1) -; LA64-NEXT: ret +; CHECK-LABEL: test144: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1137 +; CHECK-NEXT: vldi $vr1, -881 +; CHECK-NEXT: ret entry: ret { float, double } { float -3.8750000000, double -3.8750000000 } } define dso_local { float, double } @test145() { -; LA32-LABEL: test145: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI144_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; 
LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI144_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test145: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI144_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI144_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI144_1) -; LA64-NEXT: ret +; CHECK-LABEL: test145: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1136 +; CHECK-NEXT: vldi $vr1, -880 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.0000000000, double -4.0000000000 } } define dso_local { float, double } @test146() { -; LA32-LABEL: test146: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI145_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI145_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test146: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI145_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI145_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI145_1) -; LA64-NEXT: ret +; CHECK-LABEL: test146: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1135 +; CHECK-NEXT: vldi $vr1, -879 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.2500000000, double -4.2500000000 } } define dso_local { float, double } @test147() { -; LA32-LABEL: test147: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI146_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI146_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test147: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI146_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI146_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI146_1) -; LA64-NEXT: ret +; CHECK-LABEL: test147: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1134 +; CHECK-NEXT: vldi $vr1, -878 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.5000000000, double -4.5000000000 } } define dso_local { float, double } @test148() { -; LA32-LABEL: test148: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI147_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI147_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test148: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI147_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI147_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI147_1) -; LA64-NEXT: ret +; CHECK-LABEL: test148: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1133 +; CHECK-NEXT: vldi $vr1, -877 +; CHECK-NEXT: ret entry: ret { float, double } { float -4.7500000000, double -4.7500000000 } } define dso_local { float, double } @test149() { -; LA32-LABEL: test149: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI148_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI148_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test149: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI148_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI148_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI148_1) -; LA64-NEXT: ret +; CHECK-LABEL: test149: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1132 +; CHECK-NEXT: vldi $vr1, -876 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.0000000000, double -5.0000000000 } } define dso_local { float, double } @test150() { -; LA32-LABEL: test150: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI149_0) -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI149_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test150: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI149_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI149_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI149_1) -; LA64-NEXT: ret +; CHECK-LABEL: test150: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1131 +; CHECK-NEXT: vldi $vr1, -875 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.2500000000, double -5.2500000000 } } define dso_local { float, double } @test151() { -; LA32-LABEL: test151: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI150_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI150_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test151: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI150_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI150_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI150_1) -; LA64-NEXT: ret +; CHECK-LABEL: test151: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1130 +; CHECK-NEXT: vldi $vr1, -874 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.5000000000, double -5.5000000000 } } define dso_local { float, double } @test152() { -; LA32-LABEL: test152: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI151_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI151_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test152: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI151_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI151_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI151_1) -; 
LA64-NEXT: ret +; CHECK-LABEL: test152: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1129 +; CHECK-NEXT: vldi $vr1, -873 +; CHECK-NEXT: ret entry: ret { float, double } { float -5.7500000000, double -5.7500000000 } } define dso_local { float, double } @test153() { -; LA32-LABEL: test153: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI152_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI152_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test153: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI152_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI152_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI152_1) -; LA64-NEXT: ret +; CHECK-LABEL: test153: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1128 +; CHECK-NEXT: vldi $vr1, -872 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.0000000000, double -6.0000000000 } } define dso_local { float, double } @test154() { -; LA32-LABEL: test154: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI153_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI153_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test154: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI153_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI153_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI153_1) -; LA64-NEXT: ret +; CHECK-LABEL: test154: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1127 +; CHECK-NEXT: vldi $vr1, -871 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.2500000000, double -6.2500000000 } } define dso_local { float, double } @test155() { -; LA32-LABEL: test155: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; 
LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI154_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI154_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test155: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI154_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI154_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI154_1) -; LA64-NEXT: ret +; CHECK-LABEL: test155: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1126 +; CHECK-NEXT: vldi $vr1, -870 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.5000000000, double -6.5000000000 } } define dso_local { float, double } @test156() { -; LA32-LABEL: test156: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI155_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI155_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test156: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI155_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI155_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI155_1) -; LA64-NEXT: ret +; CHECK-LABEL: test156: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1125 +; CHECK-NEXT: vldi $vr1, -869 +; CHECK-NEXT: ret entry: ret { float, double } { float -6.7500000000, double -6.7500000000 } } define dso_local { float, double } @test157() { -; LA32-LABEL: test157: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI156_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI156_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test157: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI156_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI156_1) -; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI156_1) -; LA64-NEXT: ret +; CHECK-LABEL: test157: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1124 +; CHECK-NEXT: vldi $vr1, -868 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.0000000000, double -7.0000000000 } } define dso_local { float, double } @test158() { -; LA32-LABEL: test158: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI157_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI157_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test158: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI157_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI157_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI157_1) -; LA64-NEXT: ret +; CHECK-LABEL: test158: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1123 +; CHECK-NEXT: vldi $vr1, -867 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.2500000000, double -7.2500000000 } } define dso_local { float, double } @test159() { -; LA32-LABEL: test159: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI158_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI158_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test159: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI158_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI158_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI158_1) -; LA64-NEXT: ret +; CHECK-LABEL: test159: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1122 +; CHECK-NEXT: vldi $vr1, -866 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.5000000000, double -7.5000000000 } } define dso_local { float, double } @test160() { -; LA32-LABEL: test160: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI159_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI159_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test160: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI159_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI159_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI159_1) -; LA64-NEXT: ret +; CHECK-LABEL: test160: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1121 +; CHECK-NEXT: vldi $vr1, -865 +; CHECK-NEXT: ret entry: ret { float, double } { float -7.7500000000, double -7.7500000000 } } define dso_local { float, double } @test161() { -; LA32-LABEL: test161: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI160_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI160_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test161: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI160_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI160_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI160_1) -; LA64-NEXT: ret +; CHECK-LABEL: test161: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1120 +; CHECK-NEXT: vldi $vr1, -864 +; CHECK-NEXT: ret entry: ret { float, double } { float -8.0000000000, double -8.0000000000 } } define dso_local { float, double } @test162() { -; LA32-LABEL: test162: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI161_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI161_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test162: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI161_0) -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI161_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI161_1) -; LA64-NEXT: ret +; CHECK-LABEL: test162: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1119 +; CHECK-NEXT: vldi $vr1, -863 +; CHECK-NEXT: ret entry: ret { float, double } { float -8.5000000000, double -8.5000000000 } } define dso_local { float, double } @test163() { -; LA32-LABEL: test163: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI162_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI162_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test163: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI162_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI162_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI162_1) -; LA64-NEXT: ret +; CHECK-LABEL: test163: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1118 +; CHECK-NEXT: vldi $vr1, -862 +; CHECK-NEXT: ret entry: ret { float, double } { float -9.0000000000, double -9.0000000000 } } define dso_local { float, double } @test164() { -; LA32-LABEL: test164: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI163_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI163_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test164: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI163_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI163_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI163_1) -; LA64-NEXT: ret +; CHECK-LABEL: test164: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1117 +; CHECK-NEXT: vldi $vr1, -861 +; CHECK-NEXT: ret entry: ret { float, double } { float -9.5000000000, double -9.5000000000 } } define dso_local { float, double } @test165() { -; 
LA32-LABEL: test165: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI164_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI164_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test165: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI164_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI164_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI164_1) -; LA64-NEXT: ret +; CHECK-LABEL: test165: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1116 +; CHECK-NEXT: vldi $vr1, -860 +; CHECK-NEXT: ret entry: ret { float, double } { float -10.0000000000, double -10.0000000000 } } define dso_local { float, double } @test166() { -; LA32-LABEL: test166: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI165_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI165_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test166: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI165_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI165_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI165_1) -; LA64-NEXT: ret +; CHECK-LABEL: test166: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1115 +; CHECK-NEXT: vldi $vr1, -859 +; CHECK-NEXT: ret entry: ret { float, double } { float -10.5000000000, double -10.5000000000 } } define dso_local { float, double } @test167() { -; LA32-LABEL: test167: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI166_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI166_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test167: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_0) -; 
LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI166_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI166_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI166_1) -; LA64-NEXT: ret +; CHECK-LABEL: test167: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1114 +; CHECK-NEXT: vldi $vr1, -858 +; CHECK-NEXT: ret entry: ret { float, double } { float -11.0000000000, double -11.0000000000 } } define dso_local { float, double } @test168() { -; LA32-LABEL: test168: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI167_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI167_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test168: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI167_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI167_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI167_1) -; LA64-NEXT: ret +; CHECK-LABEL: test168: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1113 +; CHECK-NEXT: vldi $vr1, -857 +; CHECK-NEXT: ret entry: ret { float, double } { float -11.5000000000, double -11.5000000000 } } define dso_local { float, double } @test169() { -; LA32-LABEL: test169: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI168_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI168_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test169: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI168_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI168_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI168_1) -; LA64-NEXT: ret +; CHECK-LABEL: test169: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1112 +; CHECK-NEXT: vldi $vr1, -856 +; CHECK-NEXT: ret entry: ret { float, double } { float -12.0000000000, double 
-12.0000000000 } } define dso_local { float, double } @test170() { -; LA32-LABEL: test170: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI169_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI169_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test170: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI169_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI169_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI169_1) -; LA64-NEXT: ret +; CHECK-LABEL: test170: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1111 +; CHECK-NEXT: vldi $vr1, -855 +; CHECK-NEXT: ret entry: ret { float, double } { float -12.5000000000, double -12.5000000000 } } define dso_local { float, double } @test171() { -; LA32-LABEL: test171: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI170_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI170_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test171: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI170_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI170_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI170_1) -; LA64-NEXT: ret +; CHECK-LABEL: test171: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1110 +; CHECK-NEXT: vldi $vr1, -854 +; CHECK-NEXT: ret entry: ret { float, double } { float -13.0000000000, double -13.0000000000 } } define dso_local { float, double } @test172() { -; LA32-LABEL: test172: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI171_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI171_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test172: -; LA64: # 
%bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI171_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI171_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI171_1) -; LA64-NEXT: ret +; CHECK-LABEL: test172: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1109 +; CHECK-NEXT: vldi $vr1, -853 +; CHECK-NEXT: ret entry: ret { float, double } { float -13.5000000000, double -13.5000000000 } } define dso_local { float, double } @test173() { -; LA32-LABEL: test173: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI172_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI172_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test173: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI172_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI172_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI172_1) -; LA64-NEXT: ret +; CHECK-LABEL: test173: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1108 +; CHECK-NEXT: vldi $vr1, -852 +; CHECK-NEXT: ret entry: ret { float, double } { float -14.0000000000, double -14.0000000000 } } define dso_local { float, double } @test174() { -; LA32-LABEL: test174: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI173_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI173_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test174: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI173_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI173_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI173_1) -; LA64-NEXT: ret +; CHECK-LABEL: test174: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1107 +; CHECK-NEXT: vldi $vr1, -851 +; CHECK-NEXT: ret 
entry: ret { float, double } { float -14.5000000000, double -14.5000000000 } } define dso_local { float, double } @test175() { -; LA32-LABEL: test175: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI174_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI174_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test175: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI174_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI174_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI174_1) -; LA64-NEXT: ret +; CHECK-LABEL: test175: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1106 +; CHECK-NEXT: vldi $vr1, -850 +; CHECK-NEXT: ret entry: ret { float, double } { float -15.0000000000, double -15.0000000000 } } define dso_local { float, double } @test176() { -; LA32-LABEL: test176: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI175_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI175_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test176: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI175_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI175_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI175_1) -; LA64-NEXT: ret +; CHECK-LABEL: test176: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1105 +; CHECK-NEXT: vldi $vr1, -849 +; CHECK-NEXT: ret entry: ret { float, double } { float -15.5000000000, double -15.5000000000 } } define dso_local { float, double } @test177() { -; LA32-LABEL: test177: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI176_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA32-NEXT: fld.d $fa1, $a0, 
%pc_lo12(.LCPI176_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test177: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI176_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI176_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI176_1) -; LA64-NEXT: ret +; CHECK-LABEL: test177: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1104 +; CHECK-NEXT: vldi $vr1, -848 +; CHECK-NEXT: ret entry: ret { float, double } { float -16.0000000000, double -16.0000000000 } } define dso_local { float, double } @test178() { -; LA32-LABEL: test178: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI177_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI177_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test178: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI177_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI177_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI177_1) -; LA64-NEXT: ret +; CHECK-LABEL: test178: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1103 +; CHECK-NEXT: vldi $vr1, -847 +; CHECK-NEXT: ret entry: ret { float, double } { float -17.0000000000, double -17.0000000000 } } define dso_local { float, double } @test179() { -; LA32-LABEL: test179: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI178_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI178_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test179: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI178_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI178_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI178_1) -; LA64-NEXT: ret +; CHECK-LABEL: test179: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vldi $vr0, -1102 +; CHECK-NEXT: vldi $vr1, -846 +; CHECK-NEXT: ret entry: ret { float, double } { float -18.0000000000, double -18.0000000000 } } define dso_local { float, double } @test180() { -; LA32-LABEL: test180: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI179_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI179_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test180: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI179_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI179_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI179_1) -; LA64-NEXT: ret +; CHECK-LABEL: test180: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1101 +; CHECK-NEXT: vldi $vr1, -845 +; CHECK-NEXT: ret entry: ret { float, double } { float -19.0000000000, double -19.0000000000 } } define dso_local { float, double } @test181() { -; LA32-LABEL: test181: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI180_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI180_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test181: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI180_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI180_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI180_1) -; LA64-NEXT: ret +; CHECK-LABEL: test181: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1100 +; CHECK-NEXT: vldi $vr1, -844 +; CHECK-NEXT: ret entry: ret { float, double } { float -20.0000000000, double -20.0000000000 } } define dso_local { float, double } @test182() { -; LA32-LABEL: test182: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI181_0) -; LA32-NEXT: 
pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI181_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test182: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI181_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI181_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI181_1) -; LA64-NEXT: ret +; CHECK-LABEL: test182: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1099 +; CHECK-NEXT: vldi $vr1, -843 +; CHECK-NEXT: ret entry: ret { float, double } { float -21.0000000000, double -21.0000000000 } } define dso_local { float, double } @test183() { -; LA32-LABEL: test183: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI182_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI182_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test183: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI182_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI182_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI182_1) -; LA64-NEXT: ret +; CHECK-LABEL: test183: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1098 +; CHECK-NEXT: vldi $vr1, -842 +; CHECK-NEXT: ret entry: ret { float, double } { float -22.0000000000, double -22.0000000000 } } define dso_local { float, double } @test184() { -; LA32-LABEL: test184: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI183_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI183_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test184: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI183_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI183_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI183_1) -; LA64-NEXT: 
ret +; CHECK-LABEL: test184: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1097 +; CHECK-NEXT: vldi $vr1, -841 +; CHECK-NEXT: ret entry: ret { float, double } { float -23.0000000000, double -23.0000000000 } } define dso_local { float, double } @test185() { -; LA32-LABEL: test185: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI184_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI184_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test185: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI184_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI184_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI184_1) -; LA64-NEXT: ret +; CHECK-LABEL: test185: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1096 +; CHECK-NEXT: vldi $vr1, -840 +; CHECK-NEXT: ret entry: ret { float, double } { float -24.0000000000, double -24.0000000000 } } define dso_local { float, double } @test186() { -; LA32-LABEL: test186: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI185_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI185_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test186: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI185_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI185_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI185_1) -; LA64-NEXT: ret +; CHECK-LABEL: test186: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1095 +; CHECK-NEXT: vldi $vr1, -839 +; CHECK-NEXT: ret entry: ret { float, double } { float -25.0000000000, double -25.0000000000 } } define dso_local { float, double } @test187() { -; LA32-LABEL: test187: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; 
LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI186_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI186_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test187: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI186_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI186_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI186_1) -; LA64-NEXT: ret +; CHECK-LABEL: test187: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1094 +; CHECK-NEXT: vldi $vr1, -838 +; CHECK-NEXT: ret entry: ret { float, double } { float -26.0000000000, double -26.0000000000 } } define dso_local { float, double } @test188() { -; LA32-LABEL: test188: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI187_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI187_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test188: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI187_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI187_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI187_1) -; LA64-NEXT: ret +; CHECK-LABEL: test188: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1093 +; CHECK-NEXT: vldi $vr1, -837 +; CHECK-NEXT: ret entry: ret { float, double } { float -27.0000000000, double -27.0000000000 } } define dso_local { float, double } @test189() { -; LA32-LABEL: test189: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI188_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI188_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test189: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI188_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI188_1) -; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI188_1) -; LA64-NEXT: ret +; CHECK-LABEL: test189: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1092 +; CHECK-NEXT: vldi $vr1, -836 +; CHECK-NEXT: ret entry: ret { float, double } { float -28.0000000000, double -28.0000000000 } } define dso_local { float, double } @test190() { -; LA32-LABEL: test190: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI189_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI189_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test190: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI189_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI189_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI189_1) -; LA64-NEXT: ret +; CHECK-LABEL: test190: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1091 +; CHECK-NEXT: vldi $vr1, -835 +; CHECK-NEXT: ret entry: ret { float, double } { float -29.0000000000, double -29.0000000000 } } define dso_local { float, double } @test191() { -; LA32-LABEL: test191: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI190_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI190_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test191: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI190_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI190_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI190_1) -; LA64-NEXT: ret +; CHECK-LABEL: test191: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1090 +; CHECK-NEXT: vldi $vr1, -834 +; CHECK-NEXT: ret entry: ret { float, double } { float -30.0000000000, double -30.0000000000 } } define dso_local { float, double } @test192() { -; LA32-LABEL: test192: -; LA32: # %bb.0: # 
%entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI191_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI191_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test192: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI191_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI191_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI191_1) -; LA64-NEXT: ret +; CHECK-LABEL: test192: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1089 +; CHECK-NEXT: vldi $vr1, -833 +; CHECK-NEXT: ret entry: ret { float, double } { float -31.0000000000, double -31.0000000000 } } define dso_local { float, double } @test193() { -; LA32-LABEL: test193: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI192_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI192_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test193: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI192_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI192_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI192_1) -; LA64-NEXT: ret +; CHECK-LABEL: test193: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1088 +; CHECK-NEXT: vldi $vr1, -832 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1250000000, double -0.1250000000 } } define dso_local { float, double } @test194() { -; LA32-LABEL: test194: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI193_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI193_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test194: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_0) -; LA64-NEXT: fld.s $fa0, $a0, 
%pc_lo12(.LCPI193_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI193_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI193_1) -; LA64-NEXT: ret +; CHECK-LABEL: test194: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1087 +; CHECK-NEXT: vldi $vr1, -831 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1328125000, double -0.1328125000 } } define dso_local { float, double } @test195() { -; LA32-LABEL: test195: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI194_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI194_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test195: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI194_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI194_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI194_1) -; LA64-NEXT: ret +; CHECK-LABEL: test195: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1086 +; CHECK-NEXT: vldi $vr1, -830 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1406250000, double -0.1406250000 } } define dso_local { float, double } @test196() { -; LA32-LABEL: test196: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI195_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI195_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test196: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI195_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI195_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI195_1) -; LA64-NEXT: ret +; CHECK-LABEL: test196: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1085 +; CHECK-NEXT: vldi $vr1, -829 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1484375000, double -0.1484375000 } } define dso_local { float, 
double } @test197() { -; LA32-LABEL: test197: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI196_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI196_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test197: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI196_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI196_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI196_1) -; LA64-NEXT: ret +; CHECK-LABEL: test197: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1084 +; CHECK-NEXT: vldi $vr1, -828 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1562500000, double -0.1562500000 } } define dso_local { float, double } @test198() { -; LA32-LABEL: test198: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI197_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI197_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test198: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI197_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI197_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI197_1) -; LA64-NEXT: ret +; CHECK-LABEL: test198: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1083 +; CHECK-NEXT: vldi $vr1, -827 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1640625000, double -0.1640625000 } } define dso_local { float, double } @test199() { -; LA32-LABEL: test199: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI198_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI198_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test199: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, 
%pc_hi20(.LCPI198_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI198_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI198_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI198_1) -; LA64-NEXT: ret +; CHECK-LABEL: test199: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1082 +; CHECK-NEXT: vldi $vr1, -826 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1718750000, double -0.1718750000 } } define dso_local { float, double } @test200() { -; LA32-LABEL: test200: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI199_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI199_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test200: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI199_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI199_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI199_1) -; LA64-NEXT: ret +; CHECK-LABEL: test200: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1081 +; CHECK-NEXT: vldi $vr1, -825 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1796875000, double -0.1796875000 } } define dso_local { float, double } @test201() { -; LA32-LABEL: test201: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI200_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI200_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test201: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI200_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI200_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI200_1) -; LA64-NEXT: ret +; CHECK-LABEL: test201: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1080 +; CHECK-NEXT: vldi $vr1, -824 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1875000000, 
double -0.1875000000 } } define dso_local { float, double } @test202() { -; LA32-LABEL: test202: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI201_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI201_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test202: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI201_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI201_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI201_1) -; LA64-NEXT: ret +; CHECK-LABEL: test202: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1079 +; CHECK-NEXT: vldi $vr1, -823 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.1953125000, double -0.1953125000 } } define dso_local { float, double } @test203() { -; LA32-LABEL: test203: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI202_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI202_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test203: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI202_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI202_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI202_1) -; LA64-NEXT: ret +; CHECK-LABEL: test203: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1078 +; CHECK-NEXT: vldi $vr1, -822 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2031250000, double -0.2031250000 } } define dso_local { float, double } @test204() { -; LA32-LABEL: test204: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI203_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI203_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test204: -; LA64: 
# %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI203_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI203_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI203_1) -; LA64-NEXT: ret +; CHECK-LABEL: test204: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1077 +; CHECK-NEXT: vldi $vr1, -821 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2109375000, double -0.2109375000 } } define dso_local { float, double } @test205() { -; LA32-LABEL: test205: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI204_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI204_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test205: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI204_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI204_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI204_1) -; LA64-NEXT: ret +; CHECK-LABEL: test205: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1076 +; CHECK-NEXT: vldi $vr1, -820 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2187500000, double -0.2187500000 } } define dso_local { float, double } @test206() { -; LA32-LABEL: test206: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI205_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI205_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test206: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI205_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI205_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI205_1) -; LA64-NEXT: ret +; CHECK-LABEL: test206: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1075 +; CHECK-NEXT: vldi $vr1, -819 +; CHECK-NEXT: ret entry: 
ret { float, double } { float -0.2265625000, double -0.2265625000 } } define dso_local { float, double } @test207() { -; LA32-LABEL: test207: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI206_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI206_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test207: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI206_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI206_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI206_1) -; LA64-NEXT: ret +; CHECK-LABEL: test207: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1074 +; CHECK-NEXT: vldi $vr1, -818 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2343750000, double -0.2343750000 } } define dso_local { float, double } @test208() { -; LA32-LABEL: test208: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI207_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI207_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test208: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI207_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI207_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI207_1) -; LA64-NEXT: ret +; CHECK-LABEL: test208: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1073 +; CHECK-NEXT: vldi $vr1, -817 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2421875000, double -0.2421875000 } } define dso_local { float, double } @test209() { -; LA32-LABEL: test209: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI208_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI208_1) -; 
LA32-NEXT: ret -; -; LA64-LABEL: test209: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI208_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI208_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI208_1) -; LA64-NEXT: ret +; CHECK-LABEL: test209: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1072 +; CHECK-NEXT: vldi $vr1, -816 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2500000000, double -0.2500000000 } } define dso_local { float, double } @test210() { -; LA32-LABEL: test210: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI209_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI209_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test210: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI209_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI209_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI209_1) -; LA64-NEXT: ret +; CHECK-LABEL: test210: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1071 +; CHECK-NEXT: vldi $vr1, -815 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2656250000, double -0.2656250000 } } define dso_local { float, double } @test211() { -; LA32-LABEL: test211: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI210_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI210_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test211: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI210_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI210_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI210_1) -; LA64-NEXT: ret +; CHECK-LABEL: test211: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1070 +; 
CHECK-NEXT: vldi $vr1, -814 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2812500000, double -0.2812500000 } } define dso_local { float, double } @test212() { -; LA32-LABEL: test212: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI211_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI211_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test212: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI211_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI211_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI211_1) -; LA64-NEXT: ret +; CHECK-LABEL: test212: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1069 +; CHECK-NEXT: vldi $vr1, -813 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.2968750000, double -0.2968750000 } } define dso_local { float, double } @test213() { -; LA32-LABEL: test213: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI212_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI212_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test213: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI212_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI212_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI212_1) -; LA64-NEXT: ret +; CHECK-LABEL: test213: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1068 +; CHECK-NEXT: vldi $vr1, -812 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3125000000, double -0.3125000000 } } define dso_local { float, double } @test214() { -; LA32-LABEL: test214: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI213_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; 
LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI213_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test214: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI213_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI213_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI213_1) -; LA64-NEXT: ret +; CHECK-LABEL: test214: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1067 +; CHECK-NEXT: vldi $vr1, -811 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3281250000, double -0.3281250000 } } define dso_local { float, double } @test215() { -; LA32-LABEL: test215: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI214_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI214_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test215: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI214_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI214_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI214_1) -; LA64-NEXT: ret +; CHECK-LABEL: test215: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1066 +; CHECK-NEXT: vldi $vr1, -810 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3437500000, double -0.3437500000 } } define dso_local { float, double } @test216() { -; LA32-LABEL: test216: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI215_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI215_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test216: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI215_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI215_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI215_1) -; LA64-NEXT: ret +; CHECK-LABEL: test216: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1065 +; CHECK-NEXT: vldi $vr1, -809 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3593750000, double -0.3593750000 } } define dso_local { float, double } @test217() { -; LA32-LABEL: test217: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI216_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI216_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test217: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI216_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI216_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI216_1) -; LA64-NEXT: ret +; CHECK-LABEL: test217: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1064 +; CHECK-NEXT: vldi $vr1, -808 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3750000000, double -0.3750000000 } } define dso_local { float, double } @test218() { -; LA32-LABEL: test218: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI217_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI217_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test218: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI217_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI217_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI217_1) -; LA64-NEXT: ret +; CHECK-LABEL: test218: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1063 +; CHECK-NEXT: vldi $vr1, -807 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.3906250000, double -0.3906250000 } } define dso_local { float, double } @test219() { -; LA32-LABEL: test219: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI218_0) -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI218_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test219: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI218_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI218_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI218_1) -; LA64-NEXT: ret +; CHECK-LABEL: test219: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1062 +; CHECK-NEXT: vldi $vr1, -806 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4062500000, double -0.4062500000 } } define dso_local { float, double } @test220() { -; LA32-LABEL: test220: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI219_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI219_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test220: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI219_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI219_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI219_1) -; LA64-NEXT: ret +; CHECK-LABEL: test220: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1061 +; CHECK-NEXT: vldi $vr1, -805 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4218750000, double -0.4218750000 } } define dso_local { float, double } @test221() { -; LA32-LABEL: test221: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI220_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI220_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test221: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI220_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI220_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI220_1) -; 
LA64-NEXT: ret +; CHECK-LABEL: test221: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1060 +; CHECK-NEXT: vldi $vr1, -804 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4375000000, double -0.4375000000 } } define dso_local { float, double } @test222() { -; LA32-LABEL: test222: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI221_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI221_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test222: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI221_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI221_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI221_1) -; LA64-NEXT: ret +; CHECK-LABEL: test222: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1059 +; CHECK-NEXT: vldi $vr1, -803 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4531250000, double -0.4531250000 } } define dso_local { float, double } @test223() { -; LA32-LABEL: test223: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI222_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI222_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test223: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI222_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI222_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI222_1) -; LA64-NEXT: ret +; CHECK-LABEL: test223: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1058 +; CHECK-NEXT: vldi $vr1, -802 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4687500000, double -0.4687500000 } } define dso_local { float, double } @test224() { -; LA32-LABEL: test224: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; 
LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI223_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI223_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test224: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI223_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI223_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI223_1) -; LA64-NEXT: ret +; CHECK-LABEL: test224: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1057 +; CHECK-NEXT: vldi $vr1, -801 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.4843750000, double -0.4843750000 } } define dso_local { float, double } @test225() { -; LA32-LABEL: test225: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI224_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI224_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test225: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI224_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI224_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI224_1) -; LA64-NEXT: ret +; CHECK-LABEL: test225: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1056 +; CHECK-NEXT: vldi $vr1, -800 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5000000000, double -0.5000000000 } } define dso_local { float, double } @test226() { -; LA32-LABEL: test226: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI225_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI225_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test226: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI225_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI225_1) -; 
LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI225_1) -; LA64-NEXT: ret +; CHECK-LABEL: test226: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1055 +; CHECK-NEXT: vldi $vr1, -799 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5312500000, double -0.5312500000 } } define dso_local { float, double } @test227() { -; LA32-LABEL: test227: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI226_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI226_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test227: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI226_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI226_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI226_1) -; LA64-NEXT: ret +; CHECK-LABEL: test227: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1054 +; CHECK-NEXT: vldi $vr1, -798 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5625000000, double -0.5625000000 } } define dso_local { float, double } @test228() { -; LA32-LABEL: test228: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI227_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI227_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test228: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI227_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI227_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI227_1) -; LA64-NEXT: ret +; CHECK-LABEL: test228: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1053 +; CHECK-NEXT: vldi $vr1, -797 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.5937500000, double -0.5937500000 } } define dso_local { float, double } @test229() { -; LA32-LABEL: test229: -; LA32: # %bb.0: # %entry -; 
LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI228_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI228_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test229: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI228_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI228_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI228_1) -; LA64-NEXT: ret +; CHECK-LABEL: test229: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1052 +; CHECK-NEXT: vldi $vr1, -796 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6250000000, double -0.6250000000 } } define dso_local { float, double } @test230() { -; LA32-LABEL: test230: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI229_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI229_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test230: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI229_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI229_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI229_1) -; LA64-NEXT: ret +; CHECK-LABEL: test230: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1051 +; CHECK-NEXT: vldi $vr1, -795 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6562500000, double -0.6562500000 } } define dso_local { float, double } @test231() { -; LA32-LABEL: test231: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI230_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI230_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test231: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI230_0) -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI230_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI230_1) -; LA64-NEXT: ret +; CHECK-LABEL: test231: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1050 +; CHECK-NEXT: vldi $vr1, -794 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.6875000000, double -0.6875000000 } } define dso_local { float, double } @test232() { -; LA32-LABEL: test232: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI231_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI231_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test232: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI231_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI231_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI231_1) -; LA64-NEXT: ret +; CHECK-LABEL: test232: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1049 +; CHECK-NEXT: vldi $vr1, -793 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7187500000, double -0.7187500000 } } define dso_local { float, double } @test233() { -; LA32-LABEL: test233: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI232_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI232_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test233: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI232_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI232_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI232_1) -; LA64-NEXT: ret +; CHECK-LABEL: test233: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1048 +; CHECK-NEXT: vldi $vr1, -792 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7500000000, double -0.7500000000 } } define dso_local { float, double } @test234() { -; 
LA32-LABEL: test234: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI233_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI233_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test234: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI233_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI233_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI233_1) -; LA64-NEXT: ret +; CHECK-LABEL: test234: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1047 +; CHECK-NEXT: vldi $vr1, -791 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.7812500000, double -0.7812500000 } } define dso_local { float, double } @test235() { -; LA32-LABEL: test235: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI234_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI234_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test235: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI234_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI234_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI234_1) -; LA64-NEXT: ret +; CHECK-LABEL: test235: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1046 +; CHECK-NEXT: vldi $vr1, -790 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8125000000, double -0.8125000000 } } define dso_local { float, double } @test236() { -; LA32-LABEL: test236: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI235_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI235_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test236: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_0) -; 
LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI235_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI235_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI235_1) -; LA64-NEXT: ret +; CHECK-LABEL: test236: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1045 +; CHECK-NEXT: vldi $vr1, -789 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8437500000, double -0.8437500000 } } define dso_local { float, double } @test237() { -; LA32-LABEL: test237: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI236_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI236_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test237: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI236_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI236_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI236_1) -; LA64-NEXT: ret +; CHECK-LABEL: test237: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1044 +; CHECK-NEXT: vldi $vr1, -788 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.8750000000, double -0.8750000000 } } define dso_local { float, double } @test238() { -; LA32-LABEL: test238: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI237_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI237_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test238: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI237_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI237_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI237_1) -; LA64-NEXT: ret +; CHECK-LABEL: test238: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1043 +; CHECK-NEXT: vldi $vr1, -787 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9062500000, double -0.9062500000 } } 
define dso_local { float, double } @test239() { -; LA32-LABEL: test239: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI238_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI238_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test239: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI238_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI238_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI238_1) -; LA64-NEXT: ret +; CHECK-LABEL: test239: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1042 +; CHECK-NEXT: vldi $vr1, -786 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9375000000, double -0.9375000000 } } define dso_local { float, double } @test240() { -; LA32-LABEL: test240: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI239_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI239_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test240: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI239_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI239_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI239_1) -; LA64-NEXT: ret +; CHECK-LABEL: test240: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1041 +; CHECK-NEXT: vldi $vr1, -785 +; CHECK-NEXT: ret entry: ret { float, double } { float -0.9687500000, double -0.9687500000 } } define dso_local { float, double } @test241() { -; LA32-LABEL: test241: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI240_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI240_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test241: -; LA64: # %bb.0: # %entry -; 
LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI240_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI240_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI240_1) -; LA64-NEXT: ret +; CHECK-LABEL: test241: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1040 +; CHECK-NEXT: vldi $vr1, -784 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.0000000000, double -1.0000000000 } } define dso_local { float, double } @test242() { -; LA32-LABEL: test242: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI241_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI241_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test242: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI241_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI241_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI241_1) -; LA64-NEXT: ret +; CHECK-LABEL: test242: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1039 +; CHECK-NEXT: vldi $vr1, -783 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.0625000000, double -1.0625000000 } } define dso_local { float, double } @test243() { -; LA32-LABEL: test243: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI242_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI242_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test243: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI242_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI242_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI242_1) -; LA64-NEXT: ret +; CHECK-LABEL: test243: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1038 +; CHECK-NEXT: vldi $vr1, -782 +; CHECK-NEXT: ret entry: ret { float, double 
} { float -1.1250000000, double -1.1250000000 } } define dso_local { float, double } @test244() { -; LA32-LABEL: test244: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI243_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI243_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test244: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI243_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI243_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI243_1) -; LA64-NEXT: ret +; CHECK-LABEL: test244: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1037 +; CHECK-NEXT: vldi $vr1, -781 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.1875000000, double -1.1875000000 } } define dso_local { float, double } @test245() { -; LA32-LABEL: test245: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI244_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI244_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test245: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI244_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI244_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI244_1) -; LA64-NEXT: ret +; CHECK-LABEL: test245: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1036 +; CHECK-NEXT: vldi $vr1, -780 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.2500000000, double -1.2500000000 } } define dso_local { float, double } @test246() { -; LA32-LABEL: test246: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI245_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI245_1) -; LA32-NEXT: ret -; -; 
LA64-LABEL: test246: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI245_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI245_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI245_1) -; LA64-NEXT: ret +; CHECK-LABEL: test246: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1035 +; CHECK-NEXT: vldi $vr1, -779 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.3125000000, double -1.3125000000 } } define dso_local { float, double } @test247() { -; LA32-LABEL: test247: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI246_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI246_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test247: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI246_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI246_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI246_1) -; LA64-NEXT: ret +; CHECK-LABEL: test247: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1034 +; CHECK-NEXT: vldi $vr1, -778 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.3750000000, double -1.3750000000 } } define dso_local { float, double } @test248() { -; LA32-LABEL: test248: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI247_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI247_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test248: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI247_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI247_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI247_1) -; LA64-NEXT: ret +; CHECK-LABEL: test248: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1033 +; CHECK-NEXT: vldi $vr1, 
-777 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.4375000000, double -1.4375000000 } } define dso_local { float, double } @test249() { -; LA32-LABEL: test249: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI248_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI248_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test249: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI248_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI248_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI248_1) -; LA64-NEXT: ret +; CHECK-LABEL: test249: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1032 +; CHECK-NEXT: vldi $vr1, -776 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.5000000000, double -1.5000000000 } } define dso_local { float, double } @test250() { -; LA32-LABEL: test250: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI249_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI249_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test250: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI249_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI249_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI249_1) -; LA64-NEXT: ret +; CHECK-LABEL: test250: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1031 +; CHECK-NEXT: vldi $vr1, -775 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.5625000000, double -1.5625000000 } } define dso_local { float, double } @test251() { -; LA32-LABEL: test251: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI250_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA32-NEXT: fld.d $fa1, 
$a0, %pc_lo12(.LCPI250_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test251: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI250_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI250_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI250_1) -; LA64-NEXT: ret +; CHECK-LABEL: test251: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1030 +; CHECK-NEXT: vldi $vr1, -774 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.6250000000, double -1.6250000000 } } define dso_local { float, double } @test252() { -; LA32-LABEL: test252: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI251_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI251_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test252: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI251_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI251_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI251_1) -; LA64-NEXT: ret +; CHECK-LABEL: test252: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1029 +; CHECK-NEXT: vldi $vr1, -773 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.6875000000, double -1.6875000000 } } define dso_local { float, double } @test253() { -; LA32-LABEL: test253: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI252_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI252_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test253: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI252_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI252_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI252_1) -; LA64-NEXT: ret +; CHECK-LABEL: test253: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vldi $vr0, -1028 +; CHECK-NEXT: vldi $vr1, -772 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.7500000000, double -1.7500000000 } } define dso_local { float, double } @test254() { -; LA32-LABEL: test254: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI253_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI253_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test254: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI253_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI253_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI253_1) -; LA64-NEXT: ret +; CHECK-LABEL: test254: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1027 +; CHECK-NEXT: vldi $vr1, -771 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.8125000000, double -1.8125000000 } } define dso_local { float, double } @test255() { -; LA32-LABEL: test255: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI254_0) -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI254_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test255: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI254_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI254_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI254_1) -; LA64-NEXT: ret +; CHECK-LABEL: test255: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1026 +; CHECK-NEXT: vldi $vr1, -770 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.8750000000, double -1.8750000000 } } define dso_local { float, double } @test256() { -; LA32-LABEL: test256: -; LA32: # %bb.0: # %entry -; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA32-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI255_0) -; LA32-NEXT: pcalau12i 
$a0, %pc_hi20(.LCPI255_1) -; LA32-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI255_1) -; LA32-NEXT: ret -; -; LA64-LABEL: test256: -; LA64: # %bb.0: # %entry -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_0) -; LA64-NEXT: fld.s $fa0, $a0, %pc_lo12(.LCPI255_0) -; LA64-NEXT: pcalau12i $a0, %pc_hi20(.LCPI255_1) -; LA64-NEXT: fld.d $fa1, $a0, %pc_lo12(.LCPI255_1) -; LA64-NEXT: ret +; CHECK-LABEL: test256: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vldi $vr0, -1025 +; CHECK-NEXT: vldi $vr1, -769 +; CHECK-NEXT: ret entry: ret { float, double } { float -1.9375000000, double -1.9375000000 } } From 16dadecc05fa4986d4522c2c3a09a7628feb0fd4 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Fri, 9 Aug 2024 08:17:12 +0200 Subject: [PATCH 105/112] Fix clang flags in the SystemZ zos-mixed-ptr-sizes-malloc.c test caught by a sandboxed test run. --- clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c index b84bca6b42f483e..1b05e8aa5052aff 100644 --- a/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c +++ b/clang/test/CodeGen/SystemZ/zos-mixed-ptr-sizes-malloc.c @@ -1,4 +1,4 @@ -// RUN: %clang -target s390x-ibm-zos -emit-llvm -S -O2 %s -o - | FileCheck %s --check-prefix=X64 +// RUN: %clang_cc1 -triple s390x-ibm-zos -O2 -emit-llvm %s -o - | FileCheck %s --check-prefix=X64 #include void *__malloc31(size_t); From 8410babc2b69ccb2259e77262ad2baaa17644bea Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Fri, 9 Aug 2024 14:48:56 +0800 Subject: [PATCH 106/112] [C++20] [Moduels] Correct the linkage of const variable in language linkage from module interfaces (#102574) Close https://github.com/llvm/llvm-project/issues/99825 The root cause of the issue is that I didn't realize the things in implicit global module (the language linkage in module interfaces) should be considered in module purview. 
--- clang/lib/AST/Decl.cpp | 14 +++++++------- clang/test/Modules/pr99825.cppm | 8 ++++++++ 2 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 clang/test/Modules/pr99825.cppm diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index d832ce4190ff1a5..e125143bc1b2703 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -583,12 +583,6 @@ static bool isSingleLineLanguageLinkage(const Decl &D) { return false; } -static bool isDeclaredInModuleInterfaceOrPartition(const NamedDecl *D) { - if (auto *M = D->getOwningModule()) - return M->isInterfaceOrPartition(); - return false; -} - static LinkageInfo getExternalLinkageFor(const NamedDecl *D) { return LinkageInfo::external(); } @@ -642,7 +636,13 @@ LinkageComputer::getLVForNamespaceScopeDecl(const NamedDecl *D, // (There is no equivalent in C99.) if (Context.getLangOpts().CPlusPlus && Var->getType().isConstQualified() && !Var->getType().isVolatileQualified() && !Var->isInline() && - !isDeclaredInModuleInterfaceOrPartition(Var) && + ![Var]() { + // Check if it is module purview except private module fragment + // and implementation unit. 
+ if (auto *M = Var->getOwningModule()) + return M->isInterfaceOrPartition() || M->isImplicitGlobalModule(); + return false; + }() && !isa(Var) && !Var->getDescribedVarTemplate()) { const VarDecl *PrevVar = Var->getPreviousDecl(); diff --git a/clang/test/Modules/pr99825.cppm b/clang/test/Modules/pr99825.cppm new file mode 100644 index 000000000000000..fe6541c6e68e506 --- /dev/null +++ b/clang/test/Modules/pr99825.cppm @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify +// expected-no-diagnostics +export module mod; + +extern "C++" +{ + export constexpr auto x = 10; +} From 67cb04035fe831a3b5df98dabc44b53ba5ff7126 Mon Sep 17 00:00:00 2001 From: Jonas Hahnfeld Date: Fri, 9 Aug 2024 09:00:23 +0200 Subject: [PATCH 107/112] [clang] Remove dead incremental Parser code (#102450) When incremental processing is enabled, the Parser will never report tok::eof but tok::annot_repl_input_end. However, that case is already taken care of in IncrementalParser::ParseOrWrapTopLevelDecl() so this check was never executing. 
--- clang/lib/Parse/Parser.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 5ebe71e496a2e83..04c2f1d380bc48b 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -629,11 +629,6 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, Sema::ModuleImportState &ImportState) { DestroyTemplateIdAnnotationsRAIIObj CleanupRAII(*this); - // Skip over the EOF token, flagging end of previous input for incremental - // processing - if (PP.isIncrementalProcessingEnabled() && Tok.is(tok::eof)) - ConsumeToken(); - Result = nullptr; switch (Tok.getKind()) { case tok::annot_pragma_unused: From a50b9633357007ff886f3fd228ca4b8a9b9b9852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 9 Aug 2024 08:14:58 +0100 Subject: [PATCH 108/112] [mlir][vector] Add tests for xfer-flatten patterns (#102210) Adds tests for scalable vectors in: * vector-transfer-flatten.mlir This is rather straightfoward as the tested patterns (*) do not support scalable vectors. 
(*) `FlattenContiguousRowMajorTransferReadPattern` and `FlattenContiguousRowMajorTransferWritePattern` --- .../Vector/vector-transfer-flatten.mlir | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index df0a5c5fa0ce869..85afdf7a7dc7718 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -6,6 +6,8 @@ ///---------------------------------------------------------------------------------------- /// vector.transfer_read /// [Pattern: FlattenContiguousRowMajorTransferReadPattern] +/// +/// NOTE: Scalable vectors are not supported ///---------------------------------------------------------------------------------------- func.func @transfer_read_dims_match_contiguous( @@ -28,6 +30,22 @@ func.func @transfer_read_dims_match_contiguous( // CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous // CHECK-128B: memref.collapse_shape +func.func @transfer_read_dims_match_contiguous_scalable( + %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<5x4x3x[2]xi8> { + + %c0 = arith.constant 0 : index + %cst = arith.constant 0 : i8 + %res = vector.transfer_read %mem[%c0, %c0, %c0, %c0], %cst : + memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<5x4x3x[2]xi8> + return %res : vector<5x4x3x[2]xi8> +} + +// CHECK-LABEL: func @transfer_read_dims_match_contiguous_scalable +// CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @transfer_read_dims_match_contiguous_scalable +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_read_dims_match_contiguous_empty_stride( @@ -259,6 +277,8 @@ func.func @transfer_read_non_contiguous_src( ///---------------------------------------------------------------------------------------- /// vector.transfer_write /// [Pattern: FlattenContiguousRowMajorTransferWritePattern] +/// +/// NOTE: 
Scalable vectors are not supported ///---------------------------------------------------------------------------------------- func.func @transfer_write_dims_match_contiguous( @@ -281,6 +301,22 @@ func.func @transfer_write_dims_match_contiguous( // CHECK-128B-LABEL: func @transfer_write_dims_match_contiguous( // CHECK-128B: memref.collapse_shape +func.func @transfer_write_dims_match_contiguous_scalable( + %mem : memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, + %vec : vector<5x4x3x[2]xi8>) { + + %c0 = arith.constant 0 : index + vector.transfer_write %vec, %mem [%c0, %c0, %c0, %c0] : + vector<5x4x3x[2]xi8>, memref<5x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> + return +} + +// CHECK-LABEL: func @transfer_write_dims_match_contiguous_scalable( +// CHECK-NOT: memref.collapse_shape + +// CHECK-128B-LABEL: func @transfer_write_dims_match_contiguous_scalable +// CHECK-128B-NOT: memref.collapse_shape + // ----- func.func @transfer_write_dims_match_contiguous_empty_stride( @@ -504,7 +540,11 @@ func.func @transfer_write_non_contiguous_src( ///---------------------------------------------------------------------------------------- /// [Pattern: DropUnitDimFromElementwiseOps] +/// /// TODO: Move to a dedicated file - there's no "flattening" in the following tests +/// TODO: Potential duplication with tests from: +/// * "vector-dropleadunitdim-transforms.mlir" +/// * "vector-transfer-drop-unit-dims-patterns.mlir" ///---------------------------------------------------------------------------------------- func.func @fold_unit_dim_add_basic(%vec : vector<1x8xi32>) -> vector<1x8xi32> { From a9e75b1d4d18d09a63f120df4781013c1866b4ff Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 9 Aug 2024 16:51:42 +1000 Subject: [PATCH 109/112] [ORC][MachO] Fix race condition during MachOPlatform bootstrap. 
In 93509b4462a74 MachOPlatform was updated to store object symbols in a shared vector during bootstrap (this table is later attached to the bootstrap-completion graph once the ORC runtime's symbol table registration code is ready). The shared vector was not guarded with a mutex, so use of a concurrent dispatcher could lead to races during bootstrap. This commit fixes the issue by guarding access to the table with the BootstrapInfo mutex. No testcase since this only manifests rarely when both a concurrent dispatcher and the ORC runtime are used. Once we add a concurrent dispatcher option to llvm-jitlink we may be able to test this with a regression test in the ORC runtime and TSan enabled. rdar://133520308 --- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 0d117f7cf8734f8..a71afe1a3162f03 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -1697,22 +1697,24 @@ Error MachOPlatform::MachOPlatformPlugin::addSymbolTableRegistration( HeaderAddr = I->second; } - SymbolTableVector LocalSymTab; - auto &SymTab = LLVM_LIKELY(!InBootstrapPhase) ? LocalSymTab - : MP.Bootstrap.load()->SymTab; + if (LLVM_UNLIKELY(InBootstrapPhase)) { + // If we're in the bootstrap phase then just record these symbols in the + // bootstrap object and then bail out -- registration will be attached to + // the bootstrap graph. 
+ std::lock_guard Lock(MP.Bootstrap.load()->Mutex); + auto &SymTab = MP.Bootstrap.load()->SymTab; + for (auto &[OriginalSymbol, NameSym] : JITSymTabInfo) + SymTab.push_back({NameSym->getAddress(), OriginalSymbol->getAddress(), + flagsForSymbol(*OriginalSymbol)}); + return Error::success(); + } + + SymbolTableVector SymTab; for (auto &[OriginalSymbol, NameSym] : JITSymTabInfo) SymTab.push_back({NameSym->getAddress(), OriginalSymbol->getAddress(), flagsForSymbol(*OriginalSymbol)}); - // Bail out if we're in the bootstrap phase -- registration of thees symbols - // will be attached to the bootstrap graph. - if (LLVM_UNLIKELY(InBootstrapPhase)) - return Error::success(); - - shared::AllocActions &allocActions = LLVM_LIKELY(!InBootstrapPhase) - ? G.allocActions() - : MP.Bootstrap.load()->DeferredAAs; - allocActions.push_back( + G.allocActions().push_back( {cantFail(WrapperFunctionCall::Create( MP.RegisterObjectSymbolTable.Addr, HeaderAddr, SymTab)), cantFail(WrapperFunctionCall::Create( From d38bae3c773f0fda578097dc5a35114a83cfc2af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 9 Aug 2024 09:33:31 +0200 Subject: [PATCH 110/112] [update_llc_test_checks][AMDGPU] Update AMDGPU regexp in update_llc_test_checks.py (#102480) Updating `llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll` with `update_llc_test_checks.py` ended with several kernels with no checks. Llc's output contained the line ".amdgpu_hsa_kernel " after the ".type ,@function" entry which was not considered by the regexp. --- llvm/utils/UpdateTestChecks/asm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index f150098eaaeef48..f05d8b89e73b933 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -52,6 +52,7 @@ class string: ASM_FUNCTION_AMDGPU_RE = re.compile( r"\.type\s+_?(?P[^,\n]+),@function\n" + r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" 
r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P.*?)\n" # (body of the function) # This list is incomplete From 50a2b31800a554b35367a5f3bcebb640703a48e1 Mon Sep 17 00:00:00 2001 From: Oliver Stannard Date: Fri, 9 Aug 2024 08:50:21 +0100 Subject: [PATCH 111/112] [ARM] Be more precise about conditions for indirect tail-calls (#102451) This code was trying to predict the conditions in which an indirect tail call will have a free register to hold the target address, and falling back to a non-tail call if all non-callee-saved registers are used for arguments or return address authentication. However, it was only taking the number of arguments into account, not which registers they are allocated to, so floating-point arguments could cause this to give the wrong result, causing either a later error due to the lack of a free register, or a missed optimisation of not doing the tail call. The assignments of arguments to registers is available at this point in the code, so we can calculate exactly which registers will be available for the tail-call. --- llvm/lib/Target/ARM/ARMISelLowering.cpp | 33 ++++-- .../indirect-tail-call-free-registers.ll | 111 ++++++++++++++++++ 2 files changed, 132 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 75d16a42d0205a8..476b7b349294ab3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3024,18 +3024,27 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization( assert(Subtarget->supportsTailCall()); - // Indirect tail calls cannot be optimized for Thumb1 if the args - // to the call take up r0-r3. The reason is that there are no legal registers - // left to hold the pointer to the function to be called. 
- // Similarly, if the function uses return address sign and authentication, - // r12 is needed to hold the PAC and is not available to hold the callee - // address. - if (Outs.size() >= 4 && - (!isa(Callee.getNode()) || isIndirect)) { - if (Subtarget->isThumb1Only()) - return false; - // Conservatively assume the function spills LR. - if (MF.getInfo()->shouldSignReturnAddress(true)) + // Indirect tail-calls require a register to hold the target address. That + // register must be: + // * Allocatable (i.e. r0-r7 if the target is Thumb1). + // * Not callee-saved, so must be one of r0-r3 or r12. + // * Not used to hold an argument to the tail-called function, which might be + // in r0-r3. + // * Not used to hold the return address authentication code, which is in r12 + // if enabled. + // Sometimes, no register matches all of these conditions, so we can't do a + // tail-call. + if (!isa(Callee.getNode()) || isIndirect) { + SmallSet AddressRegisters; + for (Register R : {ARM::R0, ARM::R1, ARM::R2, ARM::R3}) + AddressRegisters.insert(R); + if (!(Subtarget->isThumb1Only() || + MF.getInfo()->shouldSignReturnAddress(true))) + AddressRegisters.insert(ARM::R12); + for (const CCValAssign &AL : ArgLocs) + if (AL.isRegLoc()) + AddressRegisters.erase(AL.getLocReg()); + if (AddressRegisters.empty()) return false; } diff --git a/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll new file mode 100644 index 000000000000000..c6ace3eb55b287b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/indirect-tail-call-free-registers.ll @@ -0,0 +1,111 @@ +; RUN: llc %s -o - -mtriple=thumbv8m.main -mattr=+vfp4 | FileCheck %s + +;; No outgoing arguments, plenty of free registers to hold the target address. +define void @test0(ptr %fptr) { +; CHECK-LABEL: test0: +; CHECK: bx {{r0|r1|r2|r3|r12}} +entry: + tail call void %fptr() + ret void +} + +;; Four integer outgoing arguments, which use up r0-r3. 
+define void @test1(ptr %fptr) { +; CHECK-LABEL: test1: +; CHECK: bx r12 +entry: + tail call void %fptr(i32 0, i32 0, i32 0, i32 0) + ret void +} + +;; Four integer outgoing arguments, which use up r0-r3, and sign-return-address +;; uses r12, so we can never tail-call this. +define void @test2(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test2: +; CHECK: blx + entry: + tail call void %fptr(i32 0, i32 0, i32 0, i32 0) + ret void +} + +;; An i32 and an i64 argument, which uses r0, r2 and r3 for arguments, leaving +;; r1 free for the address. +define void @test3(ptr %fptr) { +; CHECK-LABEL: test3: +; CHECK: bx {{r1|r12}} +entry: + tail call void %fptr(i32 0, i64 0) + ret void +} + +;; Four float arguments, using the soft-float calling convention, which uses +;; r0-r3. +define void @test4(ptr %fptr) { +; CHECK-LABEL: test4: +; CHECK: bx r12 +entry: + tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the soft-float calling convention, which uses +;; r0-r3, and sign-return-address uses r12. Currently fails with "ran out of +;; registers during register allocation". +define void @test5(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test5: +; CHECK: blx +entry: + tail call arm_aapcscc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the hard-float calling convention, which uses +;; s0-s3, leaving the all of the integer registers free for the address. +define void @test6(ptr %fptr) { +; CHECK-LABEL: test6: +; CHECK: bx {{r0|r1|r2|r3|r12}} +entry: + tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Four float arguments, using the hard-float calling convention, which uses +;; s0-s3, leaving r0-r3 free for the address, with r12 used for +;; sign-return-address. 
+define void @test7(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test7: +; CHECK: bx {{r0|r1|r2|r3}} +entry: + tail call arm_aapcs_vfpcc void %fptr(float 0.0, float 0.0, float 0.0, float 0.0) + ret void +} + +;; Two double arguments, using the soft-float calling convention, which uses +;; r0-r3. +define void @test8(ptr %fptr) { +; CHECK-LABEL: test8: +; CHECK: bx r12 +entry: + tail call arm_aapcscc void %fptr(double 0.0, double 0.0) + ret void +} + +;; Two double arguments, using the soft-float calling convention, which uses +;; r0-r3, and sign-return-address uses r12, so we can't tail-call this. +define void @test9(ptr %fptr) "sign-return-address"="all" { +; CHECK-LABEL: test9: +; CHECK: blx +entry: + tail call arm_aapcscc void %fptr(double 0.0, double 0.0) + ret void +} + +;; Four integer arguments (one on the stack), but dut to alignment r1 is left +;; empty, so can be used for the tail-call. +define void @test10(ptr %fptr, i64 %b, i32 %c) "sign-return-address"="all" { +; CHECK-LABEL: test10: +; CHECK: bx r1 +entry: + tail call void %fptr(i32 0, i64 %b, i32 %c) + ret void +} From 7e175b307e12a616b287f58af0fbfd8d9164e364 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 9 Aug 2024 09:06:57 +0100 Subject: [PATCH 112/112] [mlir][vector] Add more tests for ConvertVectorToLLVM (2/n) (#102203) Adds tests with scalable vectors for the Vector-To-LLVM conversion pass. 
Covers the following Ops: * vector.outerproduct --- .../VectorToLLVM/vector-to-llvm.mlir | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 69c4b5e5c87e521..d164e8750979689 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -600,6 +600,30 @@ func.func @outerproduct(%arg0: vector<2xf32>, %arg1: vector<3xf32>) -> vector<2x // CHECK: %[[T14:.*]] = builtin.unrealized_conversion_cast %[[T13]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T14]] : vector<2x3xf32> +func.func @outerproduct_scalable(%arg0: vector<2xf32>, %arg1: vector<[3]xf32>) -> vector<2x[3]xf32> { + %2 = vector.outerproduct %arg0, %arg1 : vector<2xf32>, vector<[3]xf32> + return %2 : vector<2x[3]xf32> +} +// CHECK-LABEL: @outerproduct_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xf32>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xf32>) +// CHECK: %[[T2:.*]] = arith.constant dense<0.000000e+00> : vector<2x[3]xf32> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T2]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T3:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T4:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T3]] : i64] : vector<2xf32> +// CHECK: %[[T5Insert:.*]] = llvm.insertelement %[[T4]] +// CHECK: %[[T5:.*]] = llvm.shufflevector %[[T5Insert]] +// CHECK: %[[T6:.*]] = arith.mulf %[[T5]], %[[B]] : vector<[3]xf32> +// CHECK: %[[T8:.*]] = llvm.insertvalue %[[T6]], %[[T7]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T9:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[T10:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T9]] : i64] : vector<2xf32> +// CHECK: %[[T11Insert:.*]] = llvm.insertelement %[[T10]] +// CHECK: %[[T11:.*]] = llvm.shufflevector %[[T11Insert]] +// CHECK: %[[T12:.*]] = arith.mulf %[[T11]], %[[B]] : 
vector<[3]xf32> +// CHECK: %[[T13:.*]] = llvm.insertvalue %[[T12]], %[[T8]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T14:.*]] = builtin.unrealized_conversion_cast %[[T13]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T14]] : vector<2x[3]xf32> + // ----- func.func @outerproduct_index(%arg0: vector<2xindex>, %arg1: vector<3xindex>) -> vector<2x3xindex> { @@ -621,6 +645,25 @@ func.func @outerproduct_index(%arg0: vector<2xindex>, %arg1: vector<3xindex>) -> // CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T6]] : vector<3xindex> to vector<3xi64> // CHECK: %{{.*}} = llvm.insertvalue %[[T7]], %[[T8]][0] : !llvm.array<2 x vector<3xi64>> +func.func @outerproduct_index_scalable(%arg0: vector<2xindex>, %arg1: vector<[3]xindex>) -> vector<2x[3]xindex> { + %2 = vector.outerproduct %arg0, %arg1 : vector<2xindex>, vector<[3]xindex> + return %2 : vector<2x[3]xindex> +} +// CHECK-LABEL: @outerproduct_index_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xindex>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xindex>) +// CHECK: %[[T1:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<2xindex> to vector<2xi64> +// CHECK: %[[T0:.*]] = arith.constant dense<0> : vector<2x[3]xindex> +// CHECK: %[[T8:.*]] = builtin.unrealized_conversion_cast %[[T0]] : vector<2x[3]xindex> to !llvm.array<2 x vector<[3]xi64>> +// CHECK: %[[T2:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T3:.*]] = llvm.extractelement %[[T1]]{{\[}}%[[T2]] : i64] : vector<2xi64> +// CHECK: %[[T4:.*]] = llvm.insertelement %[[T3]] +// CHECK: %[[T5:.*]] = llvm.shufflevector %[[T4]] +// CHECK: %[[T5Cast:.*]] = builtin.unrealized_conversion_cast %[[T5]] : vector<[3]xi64> to vector<[3]xindex> +// CHECK: %[[T6:.*]] = arith.muli %[[T5Cast]], %[[B]] : vector<[3]xindex> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[T6]] : vector<[3]xindex> to vector<[3]xi64> +// CHECK: %{{.*}} = llvm.insertvalue %[[T7]], %[[T8]][0] : !llvm.array<2 x vector<[3]xi64>> + // ----- 
func.func @outerproduct_add(%arg0: vector<2xf32>, %arg1: vector<3xf32>, %arg2: vector<2x3xf32>) -> vector<2x3xf32> { @@ -651,6 +694,34 @@ func.func @outerproduct_add(%arg0: vector<2xf32>, %arg1: vector<3xf32>, %arg2: v // CHECK: %[[T19:.*]] = builtin.unrealized_conversion_cast %[[T18]] : !llvm.array<2 x vector<3xf32>> to vector<2x3xf32> // CHECK: return %[[T19]] : vector<2x3xf32> +func.func @outerproduct_add_scalable(%arg0: vector<2xf32>, %arg1: vector<[3]xf32>, %arg2: vector<2x[3]xf32>) -> vector<2x[3]xf32> { + %2 = vector.outerproduct %arg0, %arg1, %arg2 : vector<2xf32>, vector<[3]xf32> + return %2 : vector<2x[3]xf32> +} +// CHECK-LABEL: @outerproduct_add_scalable +// CHECK-SAME: %[[A:.*]]: vector<2xf32>, +// CHECK-SAME: %[[B:.*]]: vector<[3]xf32>, +// CHECK-SAME: %[[C:.*]]: vector<2x[3]xf32>) -> vector<2x[3]xf32> +// CHECK: %[[T7:.*]] = builtin.unrealized_conversion_cast %[[C]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T3:.*]] = arith.constant dense<0.000000e+00> : vector<2x[3]xf32> +// CHECK: %[[T10:.*]] = builtin.unrealized_conversion_cast %[[T3]] : vector<2x[3]xf32> to !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T4:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[T5:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T4]] : i64] : vector<2xf32> +// CHECK: %[[T6Insert:.*]] = llvm.insertelement %[[T5]] +// CHECK: %[[T6:.*]] = llvm.shufflevector %[[T6Insert]] +// CHECK: %[[T8:.*]] = llvm.extractvalue %[[T7]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T9:.*]] = llvm.intr.fmuladd(%[[T6]], %[[B]], %[[T8]]) : (vector<[3]xf32>, vector<[3]xf32>, vector<[3]xf32>) -> vector<[3]xf32> +// CHECK: %[[T11:.*]] = llvm.insertvalue %[[T9]], %[[T10]][0] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T12:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[T13:.*]] = llvm.extractelement %[[A]]{{\[}}%[[T12]] : i64] : vector<2xf32> +// CHECK: %[[T14Insert:.*]] = llvm.insertelement %[[T13]] +// CHECK: %[[T14:.*]] = llvm.shufflevector 
%[[T14Insert]] +// CHECK: %[[T16:.*]] = llvm.extractvalue %[[T7]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T17:.*]] = llvm.intr.fmuladd(%[[T14]], %[[B]], %[[T16]]) : (vector<[3]xf32>, vector<[3]xf32>, vector<[3]xf32>) -> vector<[3]xf32> +// CHECK: %[[T18:.*]] = llvm.insertvalue %[[T17]], %[[T11]][1] : !llvm.array<2 x vector<[3]xf32>> +// CHECK: %[[T19:.*]] = builtin.unrealized_conversion_cast %[[T18]] : !llvm.array<2 x vector<[3]xf32>> to vector<2x[3]xf32> +// CHECK: return %[[T19]] : vector<2x[3]xf32> + // ----- func.func @masked_float_add_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -663,6 +734,16 @@ func.func @masked_float_add_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_8:.*]] = llvm.intr.fmuladd(%[[VAL_0]], %{{.*}}, %[[VAL_2]]) : (vector<2xf32>, vector<2xf32>, vector<2xf32>) -> vector<2xf32> // CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_3]], %[[VAL_8]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_add_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_add_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = llvm.intr.fmuladd(%[[VAL_0]], %{{.*}}, %[[VAL_2]]) : (vector<[2]xf32>, vector<[2]xf32>, vector<[2]xf32>) -> vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_3]], %[[VAL_8]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_mul_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -676,6 +757,17 @@ func.func 
@masked_float_mul_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.mulf %[[VAL_8]], %[[VAL_2]] : vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_mul_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_mul_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.mulf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_max_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -689,6 +781,17 @@ func.func @masked_float_max_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.maxnumf %[[VAL_8]], %[[VAL_2]] : vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_max_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_max_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, 
%[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.maxnumf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_float_min_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: vector<2xf32>, %m: vector<2xi1>) -> vector<2xf32> { @@ -702,6 +805,17 @@ func.func @masked_float_min_outerprod(%arg0: vector<2xf32>, %arg1: f32, %arg2: v // CHECK: %[[VAL_9:.*]] = arith.minnumf %[[VAL_8]], %[[VAL_2]] : vector<2xf32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xf32> +func.func @masked_float_min_outerprod_scalable(%arg0: vector<[2]xf32>, %arg1: f32, %arg2: vector<[2]xf32>, %m: vector<[2]xi1>) -> vector<[2]xf32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xf32>, f32 } : vector<[2]xi1> -> vector<[2]xf32> + return %0 : vector<[2]xf32> +} + +// CHECK-LABEL: func.func @masked_float_min_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xf32>, %[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: vector<[2]xf32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xf32> { +// CHECK: %[[VAL_8:.*]] = arith.mulf %[[VAL_0]], %{{.*}} : vector<[2]xf32> +// CHECK: %[[VAL_9:.*]] = arith.minnumf %[[VAL_8]], %[[VAL_2]] : vector<[2]xf32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xf32> + // ----- func.func @masked_int_add_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -715,6 +829,17 @@ func.func @masked_int_add_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func 
@masked_int_add_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_add_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_mul_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -728,6 +853,17 @@ func.func @masked_int_mul_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_mul_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_mul_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.muli %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : 
vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_max_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -741,6 +877,17 @@ func.func @masked_int_max_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.maxsi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_max_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_max_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.maxsi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_min_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -754,6 +901,17 @@ func.func @masked_int_min_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.minui %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_min_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : 
vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_min_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.minui %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_and_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -767,6 +925,17 @@ func.func @masked_int_and_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vec // CHECK: %[[VAL_9:.*]] = arith.andi %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_and_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_and_outerprod_scalable( +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.andi %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @masked_int_or_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vector<2xi32>, %m: vector<2xi1>) -> vector<2xi32> { @@ -780,6 +949,17 @@ func.func @masked_int_or_outerprod(%arg0: vector<2xi32>, %arg1: i32, %arg2: vect // CHECK: %[[VAL_9:.*]] 
= arith.ori %[[VAL_8]], %[[VAL_2]] : vector<2xi32> // CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<2xi1>, vector<2xi32> +func.func @masked_int_or_outerprod_scalable(%arg0: vector<[2]xi32>, %arg1: i32, %arg2: vector<[2]xi32>, %m: vector<[2]xi1>) -> vector<[2]xi32> { + %0 = vector.mask %m { vector.outerproduct %arg0, %arg1, %arg2 {kind = #vector.kind} : vector<[2]xi32>, i32 } : vector<[2]xi1> -> vector<[2]xi32> + return %0 : vector<[2]xi32> +} + +// CHECK-LABEL: func.func @masked_int_or_outerprod_scalable +// CHECK-SAME: %[[VAL_0:.*]]: vector<[2]xi32>, %[[VAL_1:.*]]: i32, %[[VAL_2:.*]]: vector<[2]xi32>, %[[VAL_3:.*]]: vector<[2]xi1>) -> vector<[2]xi32> { +// CHECK: %[[VAL_8:.*]] = arith.muli %[[VAL_0]], %{{.*}} : vector<[2]xi32> +// CHECK: %[[VAL_9:.*]] = arith.ori %[[VAL_8]], %[[VAL_2]] : vector<[2]xi32> +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_3]], %[[VAL_9]], %[[VAL_2]] : vector<[2]xi1>, vector<[2]xi32> + // ----- func.func @shuffle_0D_direct(%arg0: vector) -> vector<3xf32> {