diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index eeb7609ff6b5f1..6bb76d1b917db3 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1183,11 +1183,16 @@ class MCPlusBuilder {
   bool clearOffset(MCInst &Inst) const;
 
   /// Return the label of \p Inst, if available.
-  MCSymbol *getLabel(const MCInst &Inst) const;
+  MCSymbol *getInstLabel(const MCInst &Inst) const;
+
+  /// Set the label of \p Inst or return the existing label for the instruction.
+  /// This label will be emitted right before \p Inst is emitted to MCStreamer.
+  MCSymbol *getOrCreateInstLabel(MCInst &Inst, const Twine &Name,
+                                 MCContext *Ctx) const;
 
   /// Set the label of \p Inst. This label will be emitted right before \p Inst
   /// is emitted to MCStreamer.
-  bool setLabel(MCInst &Inst, MCSymbol *Label) const;
+  void setInstLabel(MCInst &Inst, MCSymbol *Label) const;
 
   /// Get instruction size specified via annotation.
   std::optional<uint32_t> getSize(const MCInst &Inst) const;
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index d544ece13a832f..b29ebbbfa18c4b 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1967,7 +1967,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
     OS << " # Offset: " << *Offset;
   if (std::optional<uint32_t> Size = MIB->getSize(Instruction))
     OS << " # Size: " << *Size;
-  if (MCSymbol *Label = MIB->getLabel(Instruction))
+  if (MCSymbol *Label = MIB->getInstLabel(Instruction))
     OS << " # Label: " << *Label;
 
   MIB->printAnnotations(Instruction, OS);
diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index d4b668c1d7e7bd..97d19b75200f51 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -489,7 +489,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
 
       if (!EmitCodeOnly) {
         // A symbol to be emitted before the instruction to mark its location.
-        MCSymbol *InstrLabel = BC.MIB->getLabel(Instr);
+        MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr);
 
         if (opts::UpdateDebugSections && BF.getDWARFUnit()) {
           LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen,
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 54f2f9d972a461..00df42c11e2239 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -1424,7 +1424,7 @@ Error BinaryFunction::disassemble() {
     InstrMapType::iterator II = Instructions.find(Offset);
     assert(II != Instructions.end() && "reference to non-existing instruction");
 
-    BC.MIB->setLabel(II->second, Label);
+    BC.MIB->setInstLabel(II->second, Label);
   }
 
   // Reset symbolizer for the disassembler.
diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp
index 54618aeb95cccb..82bddf76d5b8c9 100644
--- a/bolt/lib/Core/Exceptions.cpp
+++ b/bolt/lib/Core/Exceptions.cpp
@@ -408,12 +408,11 @@ void BinaryFunction::updateEHRanges() {
 
         // Same symbol is used for the beginning and the end of the range.
         MCSymbol *EHSymbol;
-        if (MCSymbol *InstrLabel = BC.MIB->getLabel(Instr)) {
+        if (MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr)) {
           EHSymbol = InstrLabel;
         } else {
           std::unique_lock<llvm::sys::RWMutex> Lock(BC.CtxMutex);
-          EHSymbol = BC.Ctx->createNamedTempSymbol("EH");
-          BC.MIB->setLabel(Instr, EHSymbol);
+          EHSymbol = BC.MIB->getOrCreateInstLabel(Instr, "EH", BC.Ctx.get());
         }
 
         // At this point we could be in one of the following states:
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index 44e5f88d8950fc..bd9bd0c45922a5 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -12,6 +12,7 @@
 
 #include "bolt/Core/MCPlusBuilder.h"
 #include "bolt/Core/MCPlus.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -266,17 +267,29 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) const {
   return true;
 }
 
-MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const {
+MCSymbol *MCPlusBuilder::getInstLabel(const MCInst &Inst) const {
   if (std::optional<int64_t> Label =
           getAnnotationOpValue(Inst, MCAnnotation::kLabel))
     return reinterpret_cast<MCSymbol *>(*Label);
   return nullptr;
 }
 
-bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) const {
+MCSymbol *MCPlusBuilder::getOrCreateInstLabel(MCInst &Inst, const Twine &Name,
+                                              MCContext *Ctx) const {
+  MCSymbol *Label = getInstLabel(Inst);
+  if (Label)
+    return Label;
+
+  Label = Ctx->createNamedTempSymbol(Name);
+  setAnnotationOpValue(Inst, MCAnnotation::kLabel,
+                       reinterpret_cast<int64_t>(Label));
+  return Label;
+}
+
+void MCPlusBuilder::setInstLabel(MCInst &Inst, MCSymbol *Label) const {
+  assert(!getInstLabel(Inst) && "Instruction already has assigned label.");
   setAnnotationOpValue(Inst, MCAnnotation::kLabel,
                        reinterpret_cast<int64_t>(Label));
-  return true;
 }
 
 std::optional<uint32_t> MCPlusBuilder::getSize(const MCInst &Inst) const {
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index 6377c1197253c8..0d7dc1070ce75e 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -770,11 +770,8 @@ Error LinuxKernelRewriter::rewriteORCTables() {
           continue;
 
         // Issue label for the instruction.
-        MCSymbol *Label = BC.MIB->getLabel(Inst);
-        if (!Label) {
-          Label = BC.Ctx->createTempSymbol("__ORC_");
-          BC.MIB->setLabel(Inst, Label);
-        }
+        MCSymbol *Label =
+            BC.MIB->getOrCreateInstLabel(Inst, "__ORC_", BC.Ctx.get());
 
         if (Error E = emitORCEntry(0, *ErrorOrState, Label))
           return E;
@@ -908,11 +905,8 @@ Error LinuxKernelRewriter::readStaticCalls() {
 
     BC.MIB->addAnnotation(*Inst, "StaticCall", EntryID);
 
-    MCSymbol *Label = BC.MIB->getLabel(*Inst);
-    if (!Label) {
-      Label = BC.Ctx->createTempSymbol("__SC_");
-      BC.MIB->setLabel(*Inst, Label);
-    }
+    MCSymbol *Label =
+        BC.MIB->getOrCreateInstLabel(*Inst, "__SC_", BC.Ctx.get());
 
     StaticCallEntries.push_back({EntryID, BF, Label});
   }
diff --git a/bolt/test/runtime/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s
similarity index 88%
rename from bolt/test/runtime/instrument-wrong-target.s
rename to bolt/test/runtime/X86/instrument-wrong-target.s
index b25c924ffbcc0f..343d93a89ed130 100644
--- a/bolt/test/runtime/instrument-wrong-target.s
+++ b/bolt/test/runtime/X86/instrument-wrong-target.s
@@ -1,8 +1,8 @@
 # Test that BOLT errs when trying to instrument a binary with a different
 # architecture than the one BOLT is built for.
 
-# REQUIRES: x86_64-linux,bolt-runtime
-# REQUIRES: target-x86_64 && aarch64-registered-target
+# REQUIRES: system-linux,bolt-runtime
+# REQUIRES: aarch64-registered-target
 
 # RUN: llvm-mc -triple aarch64 -filetype=obj %s -o %t.o
 # RUN: ld.lld -q -pie -o %t.exe %t.o
diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html
index c40d679e383bb2..8a06084955aa6b 100644
--- a/clang/docs/LibASTMatchersReference.html
+++ b/clang/docs/LibASTMatchersReference.html
@@ -7049,7 +7049,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasFoldInit0')"><a name="hasFoldInit0Anchor">hasFoldInit</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasFoldInit0')"><a name="hasFoldInit0Anchor">hasFoldInit</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
 <tr><td colspan="4" class="doc" id="hasFoldInit0"><pre>Matches the operand that does not contain the parameter pack.
 
 Example matches `(0 + ... + args)` and `(args * ... * 1)`
@@ -7089,7 +7089,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasPattern0')"><a name="hasPattern0Anchor">hasPattern</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1CXXFoldExpr.html">CXXFoldExpr</a>&gt;</td><td class="name" onclick="toggle('hasPattern0')"><a name="hasPattern0Anchor">hasPattern</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMacher</td></tr>
 <tr><td colspan="4" class="doc" id="hasPattern0"><pre>Matches the operand that contains the parameter pack.
 
 Example matches `(0 + ... + args)`
@@ -7859,7 +7859,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument0')"><a name="forEachTemplateArgument0Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1ClassTemplateSpecializationDecl.html">ClassTemplateSpecializationDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument0')"><a name="forEachTemplateArgument0Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachTemplateArgument0"><pre>Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
@@ -8454,7 +8454,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringElidableConstructorCall0')"><a name="ignoringElidableConstructorCall0Anchor">ignoringElidableConstructorCall</a></td><td>ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt;</td><td class="name" onclick="toggle('ignoringElidableConstructorCall0')"><a name="ignoringElidableConstructorCall0Anchor">ignoringElidableConstructorCall</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="ignoringElidableConstructorCall0"><pre>Matches expressions that match InnerMatcher that are possibly wrapped in an
 elidable constructor and other corresponding bookkeeping nodes.
 
@@ -8691,7 +8691,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument2')"><a name="forEachTemplateArgument2Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1FunctionDecl.html">FunctionDecl</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument2')"><a name="forEachTemplateArgument2Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachTemplateArgument2"><pre>Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
@@ -8959,7 +8959,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;</td><td class="name" onclick="toggle('hasInit0')"><a name="hasInit0Anchor">hasInit</a></td><td>unsigned N, ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1InitListExpr.html">InitListExpr</a>&gt;</td><td class="name" onclick="toggle('hasInit0')"><a name="hasInit0Anchor">hasInit</a></td><td>unsigned N, Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1Expr.html">Expr</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="hasInit0"><pre>Matches the n'th item of an initializer list expression.
 
 Example matches y.
@@ -10026,7 +10026,7 @@ <h2 id="traversal-matchers">AST Traversal Matchers</h2>
 </pre></td></tr>
 
 
-<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument1')"><a name="forEachTemplateArgument1Anchor">forEachTemplateArgument</a></td><td>clang::ast_matchers::Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
+<tr><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateSpecializationType.html">TemplateSpecializationType</a>&gt;</td><td class="name" onclick="toggle('forEachTemplateArgument1')"><a name="forEachTemplateArgument1Anchor">forEachTemplateArgument</a></td><td>Matcher&lt;<a href="https://clang.llvm.org/doxygen/classclang_1_1TemplateArgument.html">TemplateArgument</a>&gt; InnerMatcher</td></tr>
 <tr><td colspan="4" class="doc" id="forEachTemplateArgument1"><pre>Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
index cc7024d1627b97..705ff0d4d40985 100755
--- a/clang/docs/tools/dump_ast_matchers.py
+++ b/clang/docs/tools/dump_ast_matchers.py
@@ -116,6 +116,8 @@ def strip_doxygen(comment):
 
 def unify_arguments(args):
     """Gets rid of anything the user doesn't care about in the argument list."""
+    args = re.sub(r"clang::ast_matchers::internal::", r"", args)
+    args = re.sub(r"ast_matchers::internal::", r"", args)
     args = re.sub(r"internal::", r"", args)
     args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
     args = re.sub(r"&", r" ", args)
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index dc1f49525a004a..ced89ff127ab3e 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -4580,8 +4580,7 @@ AST_POLYMORPHIC_MATCHER_P2(hasArgument,
 ///       return (args * ... * 1);
 ///   }
 /// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher<Expr>,
-              InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasFoldInit, internal::Matcher<Expr>, InnerMacher) {
   const auto *const Init = Node.getInit();
   return Init && InnerMacher.matches(*Init, Finder, Builder);
 }
@@ -4603,8 +4602,7 @@ AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher<Expr>,
 ///       return (args * ... * 1);
 ///   }
 /// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasPattern, ast_matchers::internal::Matcher<Expr>,
-              InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasPattern, internal::Matcher<Expr>, InnerMacher) {
   const Expr *const Pattern = Node.getPattern();
   return Pattern && InnerMacher.matches(*Pattern, Finder, Builder);
 }
@@ -4685,8 +4683,8 @@ AST_MATCHER(CXXFoldExpr, isBinaryFold) { return Node.getInit() != nullptr; }
 /// \code
 ///   int x{y}.
 /// \endcode
-AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N,
-               ast_matchers::internal::Matcher<Expr>, InnerMatcher) {
+AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N, internal::Matcher<Expr>,
+               InnerMatcher) {
   return N < Node.getNumInits() &&
           InnerMatcher.matches(*Node.getInit(N), Finder, Builder);
 }
@@ -5309,7 +5307,7 @@ AST_POLYMORPHIC_MATCHER_P(
     forEachTemplateArgument,
     AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl,
                                     TemplateSpecializationType, FunctionDecl),
-    clang::ast_matchers::internal::Matcher<TemplateArgument>, InnerMatcher) {
+    internal::Matcher<TemplateArgument>, InnerMatcher) {
   ArrayRef<TemplateArgument> TemplateArgs =
       clang::ast_matchers::internal::getTemplateSpecializationArgs(Node);
   clang::ast_matchers::internal::BoundNodesTreeBuilder Result;
@@ -8525,8 +8523,8 @@ AST_MATCHER(FunctionDecl, hasTrailingReturn) {
 ///
 /// ``varDecl(hasInitializer(ignoringElidableConstructorCall(callExpr())))``
 /// matches ``H D = G()`` in C++11 through C++17 (and beyond).
-AST_MATCHER_P(Expr, ignoringElidableConstructorCall,
-              ast_matchers::internal::Matcher<Expr>, InnerMatcher) {
+AST_MATCHER_P(Expr, ignoringElidableConstructorCall, internal::Matcher<Expr>,
+              InnerMatcher) {
   // E tracks the node that we are examining.
   const Expr *E = &Node;
   // If present, remove an outer `ExprWithCleanups` corresponding to the
diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h
index 7d105920734fde..3e2046642c7fe8 100644
--- a/clang/include/clang/InstallAPI/Context.h
+++ b/clang/include/clang/InstallAPI/Context.h
@@ -9,6 +9,9 @@
 #ifndef LLVM_CLANG_INSTALLAPI_CONTEXT_H
 #define LLVM_CLANG_INSTALLAPI_CONTEXT_H
 
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/InstallAPI/HeaderFile.h"
 #include "llvm/TextAPI/InterfaceFile.h"
 #include "llvm/TextAPI/RecordVisitor.h"
 #include "llvm/TextAPI/RecordsSlice.h"
@@ -24,8 +27,23 @@ struct InstallAPIContext {
   /// Library attributes that are typically passed as linker inputs.
   llvm::MachO::RecordsSlice::BinaryAttrs BA;
 
-  /// Active target triple to parse.
-  llvm::Triple TargetTriple{};
+  /// All headers that represent a library.
+  HeaderSeq InputHeaders;
+
+  /// Active language mode to parse in.
+  Language LangMode = Language::ObjC;
+
+  /// Active header access type.
+  HeaderType Type = HeaderType::Unknown;
+
+  /// Active TargetSlice for symbol record collection.
+  std::shared_ptr<llvm::MachO::RecordsSlice> Slice;
+
+  /// FileManager for all I/O operations.
+  FileManager *FM = nullptr;
+
+  /// DiagnosticsEngine for all error reporting.
+  DiagnosticsEngine *Diags = nullptr;
 
   /// File Path of output location.
   llvm::StringRef OutputLoc{};
diff --git a/clang/include/clang/InstallAPI/Frontend.h b/clang/include/clang/InstallAPI/Frontend.h
new file mode 100644
index 00000000000000..7ee87ae028d079
--- /dev/null
+++ b/clang/include/clang/InstallAPI/Frontend.h
@@ -0,0 +1,48 @@
+//===- InstallAPI/Frontend.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Top level wrappers for InstallAPI frontend operations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_FRONTEND_H
+#define LLVM_CLANG_INSTALLAPI_FRONTEND_H
+
+#include "clang/AST/ASTConsumer.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/Visitor.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace clang {
+namespace installapi {
+
+/// Create a buffer that contains all headers to scan
+/// for global symbols with.
+std::unique_ptr<llvm::MemoryBuffer>
+createInputBuffer(const InstallAPIContext &Ctx);
+
+class InstallAPIAction : public ASTFrontendAction {
+public:
+  explicit InstallAPIAction(llvm::MachO::RecordsSlice &Records)
+      : Records(Records) {}
+
+  std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+                                                 StringRef InFile) override {
+    return std::make_unique<InstallAPIVisitor>(CI.getASTContext(), Records);
+  }
+
+private:
+  llvm::MachO::RecordsSlice &Records;
+};
+} // namespace installapi
+} // namespace clang
+
+#endif // LLVM_CLANG_INSTALLAPI_FRONTEND_H
diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h
index fc64a43b3def5c..70e83bbb3e76f6 100644
--- a/clang/include/clang/InstallAPI/HeaderFile.h
+++ b/clang/include/clang/InstallAPI/HeaderFile.h
@@ -15,6 +15,7 @@
 
 #include "clang/Basic/LangStandard.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
 #include <optional>
 #include <string>
@@ -32,6 +33,20 @@ enum class HeaderType {
   Project,
 };
 
+inline StringRef getName(const HeaderType T) {
+  switch (T) {
+  case HeaderType::Public:
+    return "Public";
+  case HeaderType::Private:
+    return "Private";
+  case HeaderType::Project:
+    return "Project";
+  case HeaderType::Unknown:
+    return "Unknown";
+  }
+  llvm_unreachable("unexpected header type");
+}
+
 class HeaderFile {
   /// Full input path to header.
   std::string FullPath;
@@ -52,6 +67,14 @@ class HeaderFile {
 
   static llvm::Regex getFrameworkIncludeRule();
 
+  HeaderType getType() const { return Type; }
+  StringRef getIncludeName() const { return IncludeName; }
+  StringRef getPath() const { return FullPath; }
+
+  bool useIncludeName() const {
+    return Type != HeaderType::Project && !IncludeName.empty();
+  }
+
   bool operator==(const HeaderFile &Other) const {
     return std::tie(Type, FullPath, IncludeName, Language) ==
            std::tie(Other.Type, Other.FullPath, Other.IncludeName,
diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h
new file mode 100644
index 00000000000000..95d669688e4f9c
--- /dev/null
+++ b/clang/include/clang/InstallAPI/Visitor.h
@@ -0,0 +1,51 @@
+//===- InstallAPI/Visitor.h -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// ASTVisitor Interface for InstallAPI frontend operations.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_INSTALLAPI_VISITOR_H
+#define LLVM_CLANG_INSTALLAPI_VISITOR_H
+
+#include "clang/AST/Mangle.h"
+#include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/TextAPI/RecordsSlice.h"
+
+namespace clang {
+namespace installapi {
+
+/// ASTVisitor for collecting declarations that represent global symbols.
+class InstallAPIVisitor final : public ASTConsumer,
+                                public RecursiveASTVisitor<InstallAPIVisitor> {
+public:
+  InstallAPIVisitor(ASTContext &ASTCtx, llvm::MachO::RecordsSlice &Slice)
+      : Slice(Slice),
+        MC(ItaniumMangleContext::create(ASTCtx, ASTCtx.getDiagnostics())),
+        Layout(ASTCtx.getTargetInfo().getDataLayoutString()) {}
+  void HandleTranslationUnit(ASTContext &ASTCtx) override;
+
+  /// Collect global variables.
+  bool VisitVarDecl(const VarDecl *D);
+
+private:
+  std::string getMangledName(const NamedDecl *D) const;
+  std::string getBackendMangledName(llvm::Twine Name) const;
+
+  llvm::MachO::RecordsSlice &Slice;
+  std::unique_ptr<clang::ItaniumMangleContext> MC;
+  StringRef Layout;
+};
+
+} // namespace installapi
+} // namespace clang
+
+#endif // LLVM_CLANG_INSTALLAPI_VISITOR_H
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index f193f959d3a6c0..b151f8d0d7a79c 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -2587,7 +2587,10 @@ bool ByteCodeExprGen<Emitter>::visitExpr(const Expr *E) {
 
     if (!this->emitFinishInit(E))
       return false;
-    return this->emitRetValue(E);
+    // We are destroying the locals AFTER the Ret op.
+    // The Ret op needs to copy the (alive) values, but the
+    // destructors may still turn the entire expression invalid.
+    return this->emitRetValue(E) && RootScope.destroyLocals();
   }
 
   return false;
@@ -3414,14 +3417,15 @@ bool ByteCodeExprGen<Emitter>::emitRecordDestruction(const Record *R) {
   // Now emit the destructor and recurse into base classes.
   if (const CXXDestructorDecl *Dtor = R->getDestructor();
       Dtor && !Dtor->isTrivial()) {
-    if (const Function *DtorFunc = getFunction(Dtor)) {
-      assert(DtorFunc->hasThisPointer());
-      assert(DtorFunc->getNumParams() == 1);
-      if (!this->emitDupPtr(SourceInfo{}))
-        return false;
-      if (!this->emitCall(DtorFunc, 0, SourceInfo{}))
-        return false;
-    }
+    const Function *DtorFunc = getFunction(Dtor);
+    if (!DtorFunc)
+      return false;
+    assert(DtorFunc->hasThisPointer());
+    assert(DtorFunc->getNumParams() == 1);
+    if (!this->emitDupPtr(SourceInfo{}))
+      return false;
+    if (!this->emitCall(DtorFunc, 0, SourceInfo{}))
+      return false;
   }
 
   for (const Record::Base &Base : llvm::reverse(R->bases())) {
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 5b3b533dba3877..acbbcc3dc9619a 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -332,7 +332,7 @@ template <class Emitter> class VariableScope {
   }
 
   virtual void emitDestruction() {}
-  virtual void emitDestructors() {}
+  virtual bool emitDestructors() { return true; }
   VariableScope *getParent() const { return Parent; }
 
 protected:
@@ -356,13 +356,18 @@ template <class Emitter> class LocalScope : public VariableScope<Emitter> {
   }
 
   /// Overriden to support explicit destruction.
-  void emitDestruction() override {
+  void emitDestruction() override { destroyLocals(); }
+
+  /// Explicit destruction of local variables.
+  bool destroyLocals() {
     if (!Idx)
-      return;
-    this->emitDestructors();
+      return true;
+
+    bool Success = this->emitDestructors();
     this->Ctx->emitDestroy(*Idx, SourceInfo{});
     removeStoredOpaqueValues();
     this->Idx = std::nullopt;
+    return Success;
   }
 
   void addLocal(const Scope::Local &Local) override {
@@ -374,19 +379,25 @@ template <class Emitter> class LocalScope : public VariableScope<Emitter> {
     this->Ctx->Descriptors[*Idx].emplace_back(Local);
   }
 
-  void emitDestructors() override {
+  bool emitDestructors() override {
     if (!Idx)
-      return;
+      return true;
     // Emit destructor calls for local variables of record
     // type with a destructor.
     for (Scope::Local &Local : this->Ctx->Descriptors[*Idx]) {
       if (!Local.Desc->isPrimitive() && !Local.Desc->isPrimitiveArray()) {
-        this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{});
-        this->Ctx->emitDestruction(Local.Desc);
-        this->Ctx->emitPopPtr(SourceInfo{});
+        if (!this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{}))
+          return false;
+
+        if (!this->Ctx->emitDestruction(Local.Desc))
+          return false;
+
+        if (!this->Ctx->emitPopPtr(SourceInfo{}))
+          return false;
         removeIfStoredOpaqueValue(Local);
       }
     }
+    return true;
   }
 
   void removeStoredOpaqueValues() {
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index 9cae25f5c4d642..c9c2bf9b145b22 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -38,8 +38,11 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E,
   this->ConvertResultToRValue = ConvertResultToRValue;
   EvalResult.setSource(E);
 
-  if (!this->visitExpr(E) && EvalResult.empty())
+  if (!this->visitExpr(E)) {
+    // EvalResult may already have a result set, but something failed
+    // after that (e.g. evaluating destructors).
     EvalResult.setInvalid();
+  }
 
   return std::move(this->EvalResult);
 }
diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h
index 28e1ae6ba3e7ab..ecf2250074cc9e 100644
--- a/clang/lib/AST/Interp/EvaluationResult.h
+++ b/clang/lib/AST/Interp/EvaluationResult.h
@@ -72,7 +72,8 @@ class EvaluationResult final {
     Kind = LValue;
   }
   void setInvalid() {
-    assert(empty());
+    // We are NOT asserting empty() here, since setting it to invalid
+    // is allowed even if there is already a result.
     Kind = Invalid;
   }
   void setValid() {
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index db52f6649c18ba..13e004371f912c 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1680,7 +1680,7 @@ bool CastFloatingIntegral(InterpState &S, CodePtr OpPC) {
     auto Status = F.convertToInteger(Result);
 
     // Float-to-Integral overflow check.
-    if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) {
+    if ((Status & APFloat::opStatus::opInvalidOp)) {
       const Expr *E = S.Current->getExpr(OpPC);
       QualType Type = E->getType();
 
@@ -1933,7 +1933,7 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) {
 inline bool ArrayDecay(InterpState &S, CodePtr OpPC) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
 
-  if (Ptr.isDummy()) {
+  if (Ptr.isZero() || Ptr.isDummy()) {
     S.Stk.push<Pointer>(Ptr);
     return true;
   }
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index fa2e03d71190f5..34ecdb967960d5 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -215,7 +215,6 @@ class Pointer {
       assert(Offset == PastEndMark && "cannot get base of a block");
       return Pointer(Pointee, Base, 0);
     }
-    assert(Offset == Base && "not an inner field");
     unsigned NewBase = Base - getInlineDesc()->Offset;
     return Pointer(Pointee, NewBase, NewBase);
   }
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 701f1ac852c256..e1ff0d92f6b2f8 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -130,42 +130,42 @@ class MatchDescendantVisitor
 
   bool TraverseGenericSelectionExpr(GenericSelectionExpr *Node) {
     // These are unevaluated, except the result expression.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return TraverseStmt(Node->getResultExpr());
     return VisitorBase::TraverseGenericSelectionExpr(Node);
   }
 
   bool TraverseUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *Node) {
     // Unevaluated context.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return true;
     return VisitorBase::TraverseUnaryExprOrTypeTraitExpr(Node);
   }
 
   bool TraverseTypeOfExprTypeLoc(TypeOfExprTypeLoc Node) {
     // Unevaluated context.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return true;
     return VisitorBase::TraverseTypeOfExprTypeLoc(Node);
   }
 
   bool TraverseDecltypeTypeLoc(DecltypeTypeLoc Node) {
     // Unevaluated context.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return true;
     return VisitorBase::TraverseDecltypeTypeLoc(Node);
   }
 
   bool TraverseCXXNoexceptExpr(CXXNoexceptExpr *Node) {
     // Unevaluated context.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return true;
     return VisitorBase::TraverseCXXNoexceptExpr(Node);
   }
 
   bool TraverseCXXTypeidExpr(CXXTypeidExpr *Node) {
     // Unevaluated context.
-    if(ignoreUnevaluatedContext)
+    if (ignoreUnevaluatedContext)
       return true;
     return VisitorBase::TraverseCXXTypeidExpr(Node);
   }
@@ -213,24 +213,26 @@ class MatchDescendantVisitor
 
 // Because we're dealing with raw pointers, let's define what we mean by that.
 static auto hasPointerType() {
-    return hasType(hasCanonicalType(pointerType()));
+  return hasType(hasCanonicalType(pointerType()));
 }
 
-static auto hasArrayType() {
-    return hasType(hasCanonicalType(arrayType()));
-}
+static auto hasArrayType() { return hasType(hasCanonicalType(arrayType())); }
 
-AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher<Stmt>, innerMatcher) {
+AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher<Stmt>,
+              innerMatcher) {
   const DynTypedMatcher &DTM = static_cast<DynTypedMatcher>(innerMatcher);
 
-  MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, true);
+  MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All,
+                                 true);
   return Visitor.findMatch(DynTypedNode::create(Node));
 }
 
-AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher<Stmt>, innerMatcher) {
+AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher<Stmt>,
+              innerMatcher) {
   const DynTypedMatcher &DTM = static_cast<DynTypedMatcher>(innerMatcher);
 
-  MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, false);
+  MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All,
+                                 false);
   return Visitor.findMatch(DynTypedNode::create(Node));
 }
 
@@ -268,10 +270,9 @@ static auto isInUnspecifiedLvalueContext(internal::Matcher<Expr> innerMatcher) {
         hasLHS(innerMatcher)
       )
     ));
-// clang-format on
+  // clang-format on
 }
 
-
 // Returns a matcher that matches any expression `e` such that `InnerMatcher`
 // matches `e` and `e` is in an Unspecified Pointer Context (UPC).
 static internal::Matcher<Stmt>
@@ -315,7 +316,7 @@ isInUnspecifiedPointerContext(internal::Matcher<Stmt> InnerMatcher) {
   // clang-format on
 
   return stmt(anyOf(CallArgMatcher, CastOperandMatcher, CompOperandMatcher,
-		    PtrSubtractionMatcher));
+                    PtrSubtractionMatcher));
   // FIXME: any more cases? (UPC excludes the RHS of an assignment.  For now we
   // don't have to check that.)
 }
@@ -481,7 +482,9 @@ class Gadget {
 #ifndef NDEBUG
   StringRef getDebugName() const {
     switch (K) {
-#define GADGET(x) case Kind::x: return #x;
+#define GADGET(x)                                                              \
+  case Kind::x:                                                                \
+    return #x;
 #include "clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def"
     }
     llvm_unreachable("Unhandled Gadget::Kind enum");
@@ -502,7 +505,6 @@ class Gadget {
   Kind K;
 };
 
-
 /// Warning gadgets correspond to unsafe code patterns that warrants
 /// an immediate warning.
 class WarningGadget : public Gadget {
@@ -513,10 +515,10 @@ class WarningGadget : public Gadget {
   bool isWarningGadget() const final { return true; }
 };
 
-/// Fixable gadgets correspond to code patterns that aren't always unsafe but need to be
-/// properly recognized in order to emit fixes. For example, if a raw pointer-type
-/// variable is replaced by a safe C++ container, every use of such variable must be
-/// carefully considered and possibly updated.
+/// Fixable gadgets correspond to code patterns that aren't always unsafe but
+/// need to be properly recognized in order to emit fixes. For example, if a raw
+/// pointer-type variable is replaced by a safe C++ container, every use of such
+/// variable must be carefully considered and possibly updated.
 class FixableGadget : public Gadget {
 public:
   FixableGadget(Kind K) : Gadget(K) {}
@@ -531,20 +533,19 @@ class FixableGadget : public Gadget {
     return std::nullopt;
   }
 
-  /// Returns a list of two elements where the first element is the LHS of a pointer assignment
-  /// statement and the second element is the RHS. This two-element list represents the fact that
-  /// the LHS buffer gets its bounds information from the RHS buffer. This information will be used
-  /// later to group all those variables whose types must be modified together to prevent type
-  /// mismatches.
+  /// Returns a list of two elements where the first element is the LHS of a
+  /// pointer assignment statement and the second element is the RHS. This
+  /// two-element list represents the fact that the LHS buffer gets its bounds
+  /// information from the RHS buffer. This information will be used later to
+  /// group all those variables whose types must be modified together to prevent
+  /// type mismatches.
   virtual std::optional<std::pair<const VarDecl *, const VarDecl *>>
   getStrategyImplications() const {
     return std::nullopt;
   }
 };
 
-static auto toSupportedVariable() {
-  return to(varDecl());
-}
+static auto toSupportedVariable() { return to(varDecl()); }
 
 using FixableGadgetList = std::vector<std::unique_ptr<FixableGadget>>;
 using WarningGadgetList = std::vector<std::unique_ptr<WarningGadget>>;
@@ -565,10 +566,10 @@ class IncrementGadget : public WarningGadget {
   }
 
   static Matcher matcher() {
-    return stmt(unaryOperator(
-      hasOperatorName("++"),
-      hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))
-    ).bind(OpTag));
+    return stmt(
+        unaryOperator(hasOperatorName("++"),
+                      hasUnaryOperand(ignoringParenImpCasts(hasPointerType())))
+            .bind(OpTag));
   }
 
   const UnaryOperator *getBaseStmt() const override { return Op; }
@@ -600,10 +601,10 @@ class DecrementGadget : public WarningGadget {
   }
 
   static Matcher matcher() {
-    return stmt(unaryOperator(
-      hasOperatorName("--"),
-      hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))
-    ).bind(OpTag));
+    return stmt(
+        unaryOperator(hasOperatorName("--"),
+                      hasUnaryOperand(ignoringParenImpCasts(hasPointerType())))
+            .bind(OpTag));
   }
 
   const UnaryOperator *getBaseStmt() const override { return Op; }
@@ -754,26 +755,25 @@ class PointerInitGadget : public FixableGadget {
 private:
   static constexpr const char *const PointerInitLHSTag = "ptrInitLHS";
   static constexpr const char *const PointerInitRHSTag = "ptrInitRHS";
-  const VarDecl * PtrInitLHS;         // the LHS pointer expression in `PI`
-  const DeclRefExpr * PtrInitRHS;         // the RHS pointer expression in `PI`
+  const VarDecl *PtrInitLHS;     // the LHS pointer expression in `PI`
+  const DeclRefExpr *PtrInitRHS; // the RHS pointer expression in `PI`
 
 public:
   PointerInitGadget(const MatchFinder::MatchResult &Result)
       : FixableGadget(Kind::PointerInit),
-    PtrInitLHS(Result.Nodes.getNodeAs<VarDecl>(PointerInitLHSTag)),
-    PtrInitRHS(Result.Nodes.getNodeAs<DeclRefExpr>(PointerInitRHSTag)) {}
+        PtrInitLHS(Result.Nodes.getNodeAs<VarDecl>(PointerInitLHSTag)),
+        PtrInitRHS(Result.Nodes.getNodeAs<DeclRefExpr>(PointerInitRHSTag)) {}
 
   static bool classof(const Gadget *G) {
     return G->getKind() == Kind::PointerInit;
   }
 
   static Matcher matcher() {
-    auto PtrInitStmt = declStmt(hasSingleDecl(varDecl(
-                                 hasInitializer(ignoringImpCasts(declRefExpr(
-                                                  hasPointerType(),
-                                                    toSupportedVariable()).
-                                                  bind(PointerInitRHSTag)))).
-                                              bind(PointerInitLHSTag)));
+    auto PtrInitStmt = declStmt(hasSingleDecl(
+        varDecl(hasInitializer(ignoringImpCasts(
+                    declRefExpr(hasPointerType(), toSupportedVariable())
+                        .bind(PointerInitRHSTag))))
+            .bind(PointerInitLHSTag)));
 
     return stmt(PtrInitStmt);
   }
@@ -793,8 +793,7 @@ class PointerInitGadget : public FixableGadget {
 
   virtual std::optional<std::pair<const VarDecl *, const VarDecl *>>
   getStrategyImplications() const override {
-      return std::make_pair(PtrInitLHS,
-                            cast<VarDecl>(PtrInitRHS->getDecl()));
+    return std::make_pair(PtrInitLHS, cast<VarDecl>(PtrInitRHS->getDecl()));
   }
 };
 
@@ -807,8 +806,8 @@ class PtrToPtrAssignmentGadget : public FixableGadget {
 private:
   static constexpr const char *const PointerAssignLHSTag = "ptrLHS";
   static constexpr const char *const PointerAssignRHSTag = "ptrRHS";
-  const DeclRefExpr * PtrLHS;         // the LHS pointer expression in `PA`
-  const DeclRefExpr * PtrRHS;         // the RHS pointer expression in `PA`
+  const DeclRefExpr *PtrLHS; // the LHS pointer expression in `PA`
+  const DeclRefExpr *PtrRHS; // the RHS pointer expression in `PA`
 
 public:
   PtrToPtrAssignmentGadget(const MatchFinder::MatchResult &Result)
@@ -821,13 +820,13 @@ class PtrToPtrAssignmentGadget : public FixableGadget {
   }
 
   static Matcher matcher() {
-    auto PtrAssignExpr = binaryOperator(allOf(hasOperatorName("="),
-      hasRHS(ignoringParenImpCasts(declRefExpr(hasPointerType(),
-                                               toSupportedVariable()).
-                                   bind(PointerAssignRHSTag))),
-                                   hasLHS(declRefExpr(hasPointerType(),
-                                                      toSupportedVariable()).
-                                          bind(PointerAssignLHSTag))));
+    auto PtrAssignExpr = binaryOperator(
+        allOf(hasOperatorName("="),
+              hasRHS(ignoringParenImpCasts(
+                  declRefExpr(hasPointerType(), toSupportedVariable())
+                      .bind(PointerAssignRHSTag))),
+              hasLHS(declRefExpr(hasPointerType(), toSupportedVariable())
+                         .bind(PointerAssignLHSTag))));
 
     return stmt(isInUnspecifiedUntypedContext(PtrAssignExpr));
   }
@@ -981,9 +980,8 @@ class ULCArraySubscriptGadget : public FixableGadget {
 
   static Matcher matcher() {
     auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType());
-    auto BaseIsArrayOrPtrDRE =
-        hasBase(ignoringParenImpCasts(declRefExpr(ArrayOrPtr,
-                                                  toSupportedVariable())));
+    auto BaseIsArrayOrPtrDRE = hasBase(
+        ignoringParenImpCasts(declRefExpr(ArrayOrPtr, toSupportedVariable())));
     auto Target =
         arraySubscriptExpr(BaseIsArrayOrPtrDRE).bind(ULCArraySubscriptTag);
 
@@ -1025,9 +1023,9 @@ class UPCStandalonePointerGadget : public FixableGadget {
 
   static Matcher matcher() {
     auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType());
-    auto target = expr(
-        ignoringParenImpCasts(declRefExpr(allOf(ArrayOrPtr,
-                              toSupportedVariable())).bind(DeclRefExprTag)));
+    auto target = expr(ignoringParenImpCasts(
+        declRefExpr(allOf(ArrayOrPtr, toSupportedVariable()))
+            .bind(DeclRefExprTag)));
     return stmt(isInUnspecifiedPointerContext(target));
   }
 
@@ -1036,9 +1034,7 @@ class UPCStandalonePointerGadget : public FixableGadget {
 
   virtual const Stmt *getBaseStmt() const override { return Node; }
 
-  virtual DeclUseList getClaimedVarUseSites() const override {
-    return {Node};
-  }
+  virtual DeclUseList getClaimedVarUseSites() const override { return {Node}; }
 };
 
 class PointerDereferenceGadget : public FixableGadget {
@@ -1103,10 +1099,10 @@ class UPCAddressofArraySubscriptGadget : public FixableGadget {
 
   static Matcher matcher() {
     return expr(isInUnspecifiedPointerContext(expr(ignoringImpCasts(
-        unaryOperator(hasOperatorName("&"),
-                      hasUnaryOperand(arraySubscriptExpr(
-                          hasBase(ignoringParenImpCasts(declRefExpr(
-                                                  toSupportedVariable()))))))
+        unaryOperator(
+            hasOperatorName("&"),
+            hasUnaryOperand(arraySubscriptExpr(hasBase(
+                ignoringParenImpCasts(declRefExpr(toSupportedVariable()))))))
             .bind(UPCAddressofArraySubscriptTag)))));
   }
 
@@ -1195,13 +1191,13 @@ class DeclUseTracker {
 class UPCPreIncrementGadget : public FixableGadget {
 private:
   static constexpr const char *const UPCPreIncrementTag =
-    "PointerPreIncrementUnderUPC";
+      "PointerPreIncrementUnderUPC";
   const UnaryOperator *Node; // the `++Ptr` node
 
 public:
   UPCPreIncrementGadget(const MatchFinder::MatchResult &Result)
-    : FixableGadget(Kind::UPCPreIncrement),
-      Node(Result.Nodes.getNodeAs<UnaryOperator>(UPCPreIncrementTag)) {
+      : FixableGadget(Kind::UPCPreIncrement),
+        Node(Result.Nodes.getNodeAs<UnaryOperator>(UPCPreIncrementTag)) {
     assert(Node != nullptr && "Expecting a non-null matching result");
   }
 
@@ -1215,10 +1211,9 @@ class UPCPreIncrementGadget : public FixableGadget {
     // can have the matcher be general, so long as `getClaimedVarUseSites` does
     // things right.
     return stmt(isInUnspecifiedPointerContext(expr(ignoringImpCasts(
-								    unaryOperator(isPreInc(),
-										  hasUnaryOperand(declRefExpr(
-                                                    toSupportedVariable()))
-										  ).bind(UPCPreIncrementTag)))));
+        unaryOperator(isPreInc(),
+                      hasUnaryOperand(declRefExpr(toSupportedVariable())))
+            .bind(UPCPreIncrementTag)))));
   }
 
   virtual std::optional<FixItList>
@@ -1782,9 +1777,9 @@ static SourceRange getSourceRangeToTokenEnd(const Decl *D,
                                             const LangOptions &LangOpts) {
   SourceLocation Begin = D->getBeginLoc();
   SourceLocation
-    End = // `D->getEndLoc` should always return the starting location of the
-    // last token, so we should get the end of the token
-    Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts);
+      End = // `D->getEndLoc` should always return the starting location of the
+      // last token, so we should get the end of the token
+      Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts);
 
   return SourceRange(Begin, End);
 }
@@ -1976,7 +1971,7 @@ PointerDereferenceGadget::getFixits(const FixitStrategy &S) const {
     if (auto LocPastOperand =
             getPastLoc(BaseDeclRefExpr, SM, Ctx.getLangOpts())) {
       return FixItList{{FixItHint::CreateRemoval(derefRange),
-			FixItHint::CreateInsertion(*LocPastOperand, "[0]")}};
+                        FixItHint::CreateInsertion(*LocPastOperand, "[0]")}};
     }
     break;
   }
@@ -2162,7 +2157,8 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx,
   // NULL pointer, we use the default constructor to initialize the span
   // object, i.e., a `std:span` variable declaration with no initializer.
   // So the fix-it is just to remove the initializer.
-  if (Init->isNullPointerConstant(Ctx,
+  if (Init->isNullPointerConstant(
+          Ctx,
           // FIXME: Why does this function not ask for `const ASTContext
           // &`? It should. Maybe worth an NFC patch later.
           Expr::NullPointerConstantValueDependence::
@@ -2230,8 +2226,10 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx,
 }
 
 #ifndef NDEBUG
-#define DEBUG_NOTE_DECL_FAIL(D, Msg)  \
-Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for declaration '" + (D)->getNameAsString() + "'" + (Msg))
+#define DEBUG_NOTE_DECL_FAIL(D, Msg)                                           \
+  Handler.addDebugNoteForVar((D), (D)->getBeginLoc(),                          \
+                             "failed to produce fixit for declaration '" +     \
+                                 (D)->getNameAsString() + "'" + (Msg))
 #else
 #define DEBUG_NOTE_DECL_FAIL(D, Msg)
 #endif
@@ -2239,8 +2237,8 @@ Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for
 // For the given variable declaration with a pointer-to-T type, returns the text
 // `std::span<T>`.  If it is unable to generate the text, returns
 // `std::nullopt`.
-static std::optional<std::string> createSpanTypeForVarDecl(const VarDecl *VD,
-                                                           const ASTContext &Ctx) {
+static std::optional<std::string>
+createSpanTypeForVarDecl(const VarDecl *VD, const ASTContext &Ctx) {
   assert(VD->getType()->isPointerType());
 
   std::optional<Qualifiers> PteTyQualifiers = std::nullopt;
@@ -2277,8 +2275,8 @@ static std::optional<std::string> createSpanTypeForVarDecl(const VarDecl *VD,
 //    the non-empty fix-it list, if fix-its are successfuly generated; empty
 //    list otherwise.
 static FixItList fixLocalVarDeclWithSpan(const VarDecl *D, ASTContext &Ctx,
-					 const StringRef UserFillPlaceHolder,
-					 UnsafeBufferUsageHandler &Handler) {
+                                         const StringRef UserFillPlaceHolder,
+                                         UnsafeBufferUsageHandler &Handler) {
   if (hasUnsupportedSpecifiers(D, Ctx.getSourceManager()))
     return {};
 
@@ -2431,9 +2429,9 @@ createOverloadsForFixedParams(const FixitStrategy &S, const FunctionDecl *FD,
         // print parameter name if provided:
         if (IdentifierInfo *II = Parm->getIdentifier())
           SS << ' ' << II->getName().str();
-      } else if (auto ParmTypeText = getRangeText(
-                     getSourceRangeToTokenEnd(Parm, SM, LangOpts),
-                     SM, LangOpts)) {
+      } else if (auto ParmTypeText =
+                     getRangeText(getSourceRangeToTokenEnd(Parm, SM, LangOpts),
+                                  SM, LangOpts)) {
         // print the whole `Parm` without modification:
         SS << ParmTypeText->str();
       } else
@@ -2577,7 +2575,8 @@ static FixItList fixVariableWithSpan(const VarDecl *VD,
                                      UnsafeBufferUsageHandler &Handler) {
   const DeclStmt *DS = Tracker.lookupDecl(VD);
   if (!DS) {
-    DEBUG_NOTE_DECL_FAIL(VD, " : variables declared this way not implemented yet");
+    DEBUG_NOTE_DECL_FAIL(VD,
+                         " : variables declared this way not implemented yet");
     return {};
   }
   if (!DS->isSingleDecl()) {
@@ -2979,8 +2978,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
 #endif
 
   assert(D && D->getBody());
-  // We do not want to visit a Lambda expression defined inside a method independently.
-  // Instead, it should be visited along with the outer method.
+  // We do not want to visit a Lambda expression defined inside a method
+  // independently. Instead, it should be visited along with the outer method.
   // FIXME: do we want to do the same thing for `BlockDecl`s?
   if (const auto *fd = dyn_cast<CXXMethodDecl>(D)) {
     if (fd->getParent()->isLambda() && fd->getParent()->isLocalClass())
@@ -2990,7 +2989,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   // Do not emit fixit suggestions for functions declared in an
   // extern "C" block.
   if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
-      for (FunctionDecl *FReDecl : FD->redecls()) {
+    for (FunctionDecl *FReDecl : FD->redecls()) {
       if (FReDecl->isExternC()) {
         EmitSuggestions = false;
         break;
@@ -3002,7 +3001,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   FixableGadgetSets FixablesForAllVars;
 
   auto [FixableGadgets, WarningGadgets, Tracker] =
-    findGadgets(D, Handler, EmitSuggestions);
+      findGadgets(D, Handler, EmitSuggestions);
 
   if (!EmitSuggestions) {
     // Our job is very easy without suggestions. Just warn about
@@ -3055,36 +3054,36 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   // Filter out non-local vars and vars with unclaimed DeclRefExpr-s.
   for (auto it = FixablesForAllVars.byVar.cbegin();
        it != FixablesForAllVars.byVar.cend();) {
-      // FIXME: need to deal with global variables later
-      if ((!it->first->isLocalVarDecl() && !isa<ParmVarDecl>(it->first))) {
+    // FIXME: need to deal with global variables later
+    if ((!it->first->isLocalVarDecl() && !isa<ParmVarDecl>(it->first))) {
 #ifndef NDEBUG
-          Handler.addDebugNoteForVar(
-              it->first, it->first->getBeginLoc(),
-              ("failed to produce fixit for '" + it->first->getNameAsString() +
-               "' : neither local nor a parameter"));
+      Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+                                 ("failed to produce fixit for '" +
+                                  it->first->getNameAsString() +
+                                  "' : neither local nor a parameter"));
 #endif
-        it = FixablesForAllVars.byVar.erase(it);
-      } else if (it->first->getType().getCanonicalType()->isReferenceType()) {
+      it = FixablesForAllVars.byVar.erase(it);
+    } else if (it->first->getType().getCanonicalType()->isReferenceType()) {
 #ifndef NDEBUG
-        Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
-                                   ("failed to produce fixit for '" +
-                                    it->first->getNameAsString() +
-                                    "' : has a reference type"));
+      Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+                                 ("failed to produce fixit for '" +
+                                  it->first->getNameAsString() +
+                                  "' : has a reference type"));
 #endif
-        it = FixablesForAllVars.byVar.erase(it);
-      } else if (Tracker.hasUnclaimedUses(it->first)) {
-        it = FixablesForAllVars.byVar.erase(it);
-      } else if (it->first->isInitCapture()) {
+      it = FixablesForAllVars.byVar.erase(it);
+    } else if (Tracker.hasUnclaimedUses(it->first)) {
+      it = FixablesForAllVars.byVar.erase(it);
+    } else if (it->first->isInitCapture()) {
 #ifndef NDEBUG
-        Handler.addDebugNoteForVar(
-            it->first, it->first->getBeginLoc(),
-                                   ("failed to produce fixit for '" + it->first->getNameAsString() +
-                                    "' : init capture"));
+      Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(),
+                                 ("failed to produce fixit for '" +
+                                  it->first->getNameAsString() +
+                                  "' : init capture"));
 #endif
-        it = FixablesForAllVars.byVar.erase(it);
-      } else {
-        ++it;
-      }
+      it = FixablesForAllVars.byVar.erase(it);
+    } else {
+      ++it;
+    }
   }
 
 #ifndef NDEBUG
@@ -3115,7 +3114,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   for (auto it : FixablesForAllVars.byVar) {
     for (const FixableGadget *fixable : it.second) {
       std::optional<std::pair<const VarDecl *, const VarDecl *>> ImplPair =
-                                  fixable->getStrategyImplications();
+          fixable->getStrategyImplications();
       if (ImplPair) {
         std::pair<const VarDecl *, const VarDecl *> Impl = std::move(*ImplPair);
         PtrAssignmentGraph[Impl.first].insert(Impl.second);
@@ -3144,10 +3143,10 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   for (const auto &[Var, ignore] : UnsafeOps.byVar) {
     if (VisitedVarsDirected.find(Var) == VisitedVarsDirected.end()) {
 
-      std::queue<const VarDecl*> QueueDirected{};
+      std::queue<const VarDecl *> QueueDirected{};
       QueueDirected.push(Var);
-      while(!QueueDirected.empty()) {
-        const VarDecl* CurrentVar = QueueDirected.front();
+      while (!QueueDirected.empty()) {
+        const VarDecl *CurrentVar = QueueDirected.front();
         QueueDirected.pop();
         VisitedVarsDirected.insert(CurrentVar);
         auto AdjacentNodes = PtrAssignmentGraph[CurrentVar];
@@ -3178,11 +3177,11 @@ void clang::checkUnsafeBufferUsage(const Decl *D,
   for (const auto &[Var, ignore] : UnsafeOps.byVar) {
     if (VisitedVars.find(Var) == VisitedVars.end()) {
       VarGrpTy &VarGroup = Groups.emplace_back();
-      std::queue<const VarDecl*> Queue{};
+      std::queue<const VarDecl *> Queue{};
 
       Queue.push(Var);
-      while(!Queue.empty()) {
-        const VarDecl* CurrentVar = Queue.front();
+      while (!Queue.empty()) {
+        const VarDecl *CurrentVar = Queue.front();
         Queue.pop();
         VisitedVars.insert(CurrentVar);
         VarGroup.push_back(CurrentVar);
diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
index 1b56cf7c596d06..efcff958ce5452 100644
--- a/clang/lib/CodeGen/ABIInfo.cpp
+++ b/clang/lib/CodeGen/ABIInfo.cpp
@@ -184,6 +184,58 @@ ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty,
                                       /*ByVal*/ false, Realign);
 }
 
+void ABIInfo::appendAttributeMangling(TargetAttr *Attr,
+                                      raw_ostream &Out) const {
+  if (Attr->isDefaultVersion())
+    return;
+  appendAttributeMangling(Attr->getFeaturesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetVersionAttr *Attr,
+                                      raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getNamesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+                                      raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+  Out << '.' << Attr->getMangledIndex(Index);
+}
+
+void ABIInfo::appendAttributeMangling(StringRef AttrStr,
+                                      raw_ostream &Out) const {
+  if (AttrStr == "default") {
+    Out << ".default";
+    return;
+  }
+
+  Out << '.';
+  const TargetInfo &TI = CGT.getTarget();
+  ParsedTargetAttr Info = TI.parseTargetAttr(AttrStr);
+
+  llvm::sort(Info.Features, [&TI](StringRef LHS, StringRef RHS) {
+    // Multiversioning doesn't allow "no-${feature}", so we can
+    // only have "+" prefixes here.
+    assert(LHS.starts_with("+") && RHS.starts_with("+") &&
+           "Features should always have a prefix.");
+    return TI.multiVersionSortPriority(LHS.substr(1)) >
+           TI.multiVersionSortPriority(RHS.substr(1));
+  });
+
+  bool IsFirst = true;
+  if (!Info.CPU.empty()) {
+    IsFirst = false;
+    Out << "arch_" << Info.CPU;
+  }
+
+  for (StringRef Feat : Info.Features) {
+    if (!IsFirst)
+      Out << '_';
+    IsFirst = false;
+    Out << Feat.substr(1);
+  }
+}
+
 // Pin the vtable to this file.
 SwiftABIInfo::~SwiftABIInfo() = default;
 
diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h
index b9a5ef6e436693..ff4ae44a42c332 100644
--- a/clang/lib/CodeGen/ABIInfo.h
+++ b/clang/lib/CodeGen/ABIInfo.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_ABIINFO_H
 #define LLVM_CLANG_LIB_CODEGEN_ABIINFO_H
 
+#include "clang/AST/Attr.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Type.h"
 #include "llvm/IR/CallingConv.h"
@@ -111,6 +112,15 @@ class ABIInfo {
 
   CodeGen::ABIArgInfo getNaturalAlignIndirectInReg(QualType Ty,
                                                    bool Realign = false) const;
+
+  virtual void appendAttributeMangling(TargetAttr *Attr,
+                                       raw_ostream &Out) const;
+  virtual void appendAttributeMangling(TargetVersionAttr *Attr,
+                                       raw_ostream &Out) const;
+  virtual void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+                                       raw_ostream &Out) const;
+  virtual void appendAttributeMangling(StringRef AttrStr,
+                                       raw_ostream &Out) const;
 };
 
 /// Target specific hooks for defining how a type should be passed or returned
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d05cf1c6e1814e..13f68237b464d6 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3221,12 +3221,11 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
 
       llvm::StructType *STy =
           dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
-      llvm::TypeSize StructSize;
-      llvm::TypeSize PtrElementSize;
       if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
           STy->getNumElements() > 1) {
-        StructSize = CGM.getDataLayout().getTypeAllocSize(STy);
-        PtrElementSize =
+        [[maybe_unused]] llvm::TypeSize StructSize =
+            CGM.getDataLayout().getTypeAllocSize(STy);
+        [[maybe_unused]] llvm::TypeSize PtrElementSize =
             CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty));
         if (STy->containsHomogeneousScalableVectorTypes()) {
           assert(StructSize == PtrElementSize &&
@@ -5310,12 +5309,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
       llvm::StructType *STy =
           dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
-      llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
-      llvm::TypeSize SrcTypeSize;
-      llvm::TypeSize DstTypeSize;
       if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) {
-        SrcTypeSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
-        DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+        llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
+        [[maybe_unused]] llvm::TypeSize SrcTypeSize =
+            CGM.getDataLayout().getTypeAllocSize(SrcTy);
+        [[maybe_unused]] llvm::TypeSize DstTypeSize =
+            CGM.getDataLayout().getTypeAllocSize(STy);
         if (STy->containsHomogeneousScalableVectorTypes()) {
           assert(SrcTypeSize == DstTypeSize &&
                  "Only allow non-fractional movement of structure with "
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d16d12fac8b03c..82a97ecfaa0078 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1727,59 +1727,6 @@ static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM,
     Out << ".resolver";
 }
 
-static void AppendTargetVersionMangling(const CodeGenModule &CGM,
-                                        const TargetVersionAttr *Attr,
-                                        raw_ostream &Out) {
-  if (Attr->isDefaultVersion()) {
-    Out << ".default";
-    return;
-  }
-  Out << "._";
-  const TargetInfo &TI = CGM.getTarget();
-  llvm::SmallVector<StringRef, 8> Feats;
-  Attr->getFeatures(Feats);
-  llvm::stable_sort(Feats, [&TI](const StringRef FeatL, const StringRef FeatR) {
-    return TI.multiVersionSortPriority(FeatL) <
-           TI.multiVersionSortPriority(FeatR);
-  });
-  for (const auto &Feat : Feats) {
-    Out << 'M';
-    Out << Feat;
-  }
-}
-
-static void AppendTargetMangling(const CodeGenModule &CGM,
-                                 const TargetAttr *Attr, raw_ostream &Out) {
-  if (Attr->isDefaultVersion())
-    return;
-
-  Out << '.';
-  const TargetInfo &Target = CGM.getTarget();
-  ParsedTargetAttr Info = Target.parseTargetAttr(Attr->getFeaturesStr());
-  llvm::sort(Info.Features, [&Target](StringRef LHS, StringRef RHS) {
-    // Multiversioning doesn't allow "no-${feature}", so we can
-    // only have "+" prefixes here.
-    assert(LHS.starts_with("+") && RHS.starts_with("+") &&
-           "Features should always have a prefix.");
-    return Target.multiVersionSortPriority(LHS.substr(1)) >
-           Target.multiVersionSortPriority(RHS.substr(1));
-  });
-
-  bool IsFirst = true;
-
-  if (!Info.CPU.empty()) {
-    IsFirst = false;
-    Out << "arch_" << Info.CPU;
-  }
-
-  for (StringRef Feat : Info.Features) {
-    if (!IsFirst)
-      Out << '_';
-    IsFirst = false;
-    Out << Feat.substr(1);
-  }
-}
-
 // Returns true if GD is a function decl with internal linkage and
 // needs a unique suffix after the mangled name.
 static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
@@ -1789,41 +1736,6 @@ static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
          (CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage);
 }
 
-static void AppendTargetClonesMangling(const CodeGenModule &CGM,
-                                       const TargetClonesAttr *Attr,
-                                       unsigned VersionIndex,
-                                       raw_ostream &Out) {
-  const TargetInfo &TI = CGM.getTarget();
-  if (TI.getTriple().isAArch64()) {
-    StringRef FeatureStr = Attr->getFeatureStr(VersionIndex);
-    if (FeatureStr == "default") {
-      Out << ".default";
-      return;
-    }
-    Out << "._";
-    SmallVector<StringRef, 8> Features;
-    FeatureStr.split(Features, "+");
-    llvm::stable_sort(Features,
-                      [&TI](const StringRef FeatL, const StringRef FeatR) {
-                        return TI.multiVersionSortPriority(FeatL) <
-                               TI.multiVersionSortPriority(FeatR);
-                      });
-    for (auto &Feat : Features) {
-      Out << 'M';
-      Out << Feat;
-    }
-  } else {
-    Out << '.';
-    StringRef FeatureStr = Attr->getFeatureStr(VersionIndex);
-    if (FeatureStr.starts_with("arch="))
-      Out << "arch_" << FeatureStr.substr(sizeof("arch=") - 1);
-    else
-      Out << FeatureStr;
-
-    Out << '.' << Attr->getMangledIndex(VersionIndex);
-  }
-}
-
 static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
                                       const NamedDecl *ND,
                                       bool OmitMultiVersionMangling = false) {
@@ -1877,16 +1789,25 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
                                              FD->getAttr<CPUSpecificAttr>(),
                                              GD.getMultiVersionIndex(), Out);
         break;
-      case MultiVersionKind::Target:
-        AppendTargetMangling(CGM, FD->getAttr<TargetAttr>(), Out);
+      case MultiVersionKind::Target: {
+        auto *Attr = FD->getAttr<TargetAttr>();
+        const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+        Info.appendAttributeMangling(Attr, Out);
         break;
-      case MultiVersionKind::TargetVersion:
-        AppendTargetVersionMangling(CGM, FD->getAttr<TargetVersionAttr>(), Out);
+      }
+      case MultiVersionKind::TargetVersion: {
+        auto *Attr = FD->getAttr<TargetVersionAttr>();
+        const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+        Info.appendAttributeMangling(Attr, Out);
         break;
-      case MultiVersionKind::TargetClones:
-        AppendTargetClonesMangling(CGM, FD->getAttr<TargetClonesAttr>(),
-                                   GD.getMultiVersionIndex(), Out);
+      }
+      case MultiVersionKind::TargetClones: {
+        auto *Attr = FD->getAttr<TargetClonesAttr>();
+        unsigned Index = GD.getMultiVersionIndex();
+        const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo();
+        Info.appendAttributeMangling(Attr, Index, Out);
         break;
+      }
       case MultiVersionKind::None:
         llvm_unreachable("None multiversion type isn't valid here");
       }
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 94f8e7be2ee6eb..adfdd516351901 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -9,6 +9,7 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
 #include "clang/Basic/DiagnosticFrontend.h"
+#include "llvm/TargetParser/AArch64TargetParser.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -75,6 +76,12 @@ class AArch64ABIInfo : public ABIInfo {
   bool allowBFloatArgsAndRet() const override {
     return getTarget().hasBFloat16Type();
   }
+
+  using ABIInfo::appendAttributeMangling;
+  void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+                               raw_ostream &Out) const override;
+  void appendAttributeMangling(StringRef AttrStr,
+                               raw_ostream &Out) const override;
 };
 
 class AArch64SwiftABIInfo : public SwiftABIInfo {
@@ -857,6 +864,39 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABI(
           << Callee->getDeclName();
 }
 
+void AArch64ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr,
+                                             unsigned Index,
+                                             raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+}
+
+void AArch64ABIInfo::appendAttributeMangling(StringRef AttrStr,
+                                             raw_ostream &Out) const {
+  if (AttrStr == "default") {
+    Out << ".default";
+    return;
+  }
+
+  Out << "._";
+  SmallVector<StringRef, 8> Features;
+  AttrStr.split(Features, "+");
+  for (auto &Feat : Features)
+    Feat = Feat.trim();
+
+  // FIXME: It was brought up in #79316 that sorting the features which are
+  // used for mangling based on their multiversion priority is not a good
+  // practice. Changing the feature priorities will break the ABI. Perhaps
+  // it would be preferable to perform a lexicographical sort instead.
+  const TargetInfo &TI = CGT.getTarget();
+  llvm::sort(Features, [&TI](const StringRef LHS, const StringRef RHS) {
+    return TI.multiVersionSortPriority(LHS) < TI.multiVersionSortPriority(RHS);
+  });
+
+  for (auto &Feat : Features)
+    if (auto Ext = llvm::AArch64::parseArchExtension(Feat))
+      Out << 'M' << Ext->Name;
+}
+
 std::unique_ptr<TargetCodeGenInfo>
 CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM,
                                         AArch64ABIKind Kind) {
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index dbfc729bba24c7..66c3a237c12117 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4479,10 +4479,9 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T,
                       options::OPT_gpubnames, options::OPT_gno_pubnames);
   if (DwarfFission != DwarfFissionKind::None ||
       (PubnamesArg && checkDebugInfoOption(PubnamesArg, Args, D, TC)))
-    if (DebuggerTuning != llvm::DebuggerKind::LLDB &&
-        (!PubnamesArg ||
-         (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) &&
-          !PubnamesArg->getOption().matches(options::OPT_gno_pubnames))))
+    if (!PubnamesArg ||
+        (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) &&
+         !PubnamesArg->getOption().matches(options::OPT_gno_pubnames)))
       CmdArgs.push_back(PubnamesArg && PubnamesArg->getOption().matches(
                                            options::OPT_gpubnames)
                             ? "-gpubnames"
@@ -5959,7 +5958,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
 
   if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_address_map,
                                options::OPT_fno_basic_block_address_map)) {
-    if (Triple.isX86() && Triple.isOSBinFormatELF()) {
+    if ((Triple.isX86() || Triple.isAArch64()) && Triple.isOSBinFormatELF()) {
       if (A->getOption().matches(options::OPT_fbasic_block_address_map))
         A->render(Args, CmdArgs);
     } else {
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index 34d1de0a060055..11e1e7d032586f 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -1306,75 +1306,15 @@ float min(float __x, float __y) { return __builtin_fminf(__x, __y); }
 __DEVICE__
 double min(double __x, double __y) { return __builtin_fmin(__x, __y); }
 
-// Define host min/max functions.
-#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) &&                  \
-    !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__)
-
-#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS")
-#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS")
-#define DEFINE_MIN_MAX_FUNCTIONS(ret_type, type1, type2)                       \
-  inline ret_type min(const type1 __a, const type2 __b) {                      \
-    return (__a < __b) ? __a : __b;                                            \
-  }                                                                            \
-  inline ret_type max(const type1 __a, const type2 __b) {                      \
-    return (__a > __b) ? __a : __b;                                            \
-  }
-
-// Define min and max functions for same type comparisons
-DEFINE_MIN_MAX_FUNCTIONS(int, int, int)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, unsigned int)
-DEFINE_MIN_MAX_FUNCTIONS(long, long, long)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, unsigned long)
-DEFINE_MIN_MAX_FUNCTIONS(long long, long long, long long)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long,
-                         unsigned long long)
-
-// The host min/max functions below accept mixed signed/unsigned integer
-// parameters and perform unsigned comparisons, which may produce unexpected
-// results if a signed integer was passed unintentionally. To avoid this
-// happening silently, these overloaded functions are not defined by default.
-// However, for compatibility with CUDA, they will be defined if users define
-// __HIP_DEFINE_MIXED_HOST_MIN_MAX__.
-#ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__
-DEFINE_MIN_MAX_FUNCTIONS(unsigned int, int, unsigned int)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, int)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long, long, unsigned long)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, long)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, long long, unsigned long long)
-DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, long long)
-#endif // ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__
-
-// Floating-point comparisons using built-in functions
-inline float min(float const __a, float const __b) {
-  return __builtin_fminf(__a, __b);
-}
-inline double min(double const __a, double const __b) {
-  return __builtin_fmin(__a, __b);
-}
-inline double min(float const __a, double const __b) {
-  return __builtin_fmin(__a, __b);
-}
-inline double min(double const __a, float const __b) {
-  return __builtin_fmin(__a, __b);
+#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
+__host__ inline static int min(int __arg1, int __arg2) {
+  return __arg1 < __arg2 ? __arg1 : __arg2;
 }
 
-inline float max(float const __a, float const __b) {
-  return __builtin_fmaxf(__a, __b);
-}
-inline double max(double const __a, double const __b) {
-  return __builtin_fmax(__a, __b);
-}
-inline double max(float const __a, double const __b) {
-  return __builtin_fmax(__a, __b);
+__host__ inline static int max(int __arg1, int __arg2) {
+  return __arg1 > __arg2 ? __arg1 : __arg2;
 }
-inline double max(double const __a, float const __b) {
-  return __builtin_fmax(__a, __b);
-}
-
-#pragma pop_macro("DEFINE_MIN_MAX_FUNCTIONS")
-
-#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) &&
-       // !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__)
+#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__)
 #endif
 
 #pragma pop_macro("__DEVICE__")
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 96e3ebdecbdf83..1d451b5f5b25de 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -2099,9 +2099,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
 }
 
 /// Adds, with saturation, the corresponding elements of two 128-bit
-///    signed [16 x i8] vectors, saving each sum in the corresponding element of
-///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
-///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
+///    signed [16 x i8] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [16 x i8].
+///
+///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
+///    less than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2119,10 +2121,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
 }
 
 /// Adds, with saturation, the corresponding elements of two 128-bit
-///    signed [8 x i16] vectors, saving each sum in the corresponding element of
-///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
-///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-///    0x8000.
+///    signed [8 x i16] vectors, saving each sum in the corresponding element
+///    of a 128-bit result vector of [8 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2141,8 +2144,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
 
 /// Adds, with saturation, the corresponding elements of two 128-bit
 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
-///    are saturated to 0xFF. Negative sums are saturated to 0x00.
+///    of a 128-bit result vector of [16 x i8].
+///
+///    Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
+///    saturated to 0x00.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2161,8 +2166,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
 
 /// Adds, with saturation, the corresponding elements of two 128-bit
 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
-///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
-///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
+///    of a 128-bit result vector of [8 x i16].
+///
+///    Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
+///    are saturated to 0x0000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2518,10 +2525,12 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
   return (__m128i)((__v2du)__a - (__v2du)__b);
 }
 
-/// Subtracts corresponding 8-bit signed integer values in the input and
-///    returns the differences in the corresponding bytes in the destination.
-///    Differences greater than 0x7F are saturated to 0x7F, and differences less
-///    than 0x80 are saturated to 0x80.
+/// Subtracts, with saturation, corresponding 8-bit signed integer values in
+///    the input and returns the differences in the corresponding bytes in the
+///    destination.
+///
+///    Differences greater than 0x7F are saturated to 0x7F, and differences
+///    less than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2538,8 +2547,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
 }
 
-/// Subtracts corresponding 16-bit signed integer values in the input and
-///    returns the differences in the corresponding bytes in the destination.
+/// Subtracts, with saturation, corresponding 16-bit signed integer values in
+///    the input and returns the differences in the corresponding bytes in the
+///    destination.
+///
 ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
 ///    than 0x8000 are saturated to 0x8000.
 ///
@@ -2558,9 +2569,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
 }
 
-/// Subtracts corresponding 8-bit unsigned integer values in the input
-///    and returns the differences in the corresponding bytes in the
-///    destination. Differences less than 0x00 are saturated to 0x00.
+/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
+///    the input and returns the differences in the corresponding bytes in the
+///    destination.
+///
+///    Differences less than 0x00 are saturated to 0x00.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -2577,9 +2590,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
 }
 
-/// Subtracts corresponding 16-bit unsigned integer values in the input
-///    and returns the differences in the corresponding bytes in the
-///    destination. Differences less than 0x0000 are saturated to 0x0000.
+/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
+///    the input and returns the differences in the corresponding bytes in the
+///    destination.
+///
+///    Differences less than 0x0000 are saturated to 0x0000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -4050,26 +4065,22 @@ void _mm_mfence(void);
 } // extern "C"
 #endif
 
-/// Converts 16-bit signed integers from both 128-bit integer vector
-///    operands into 8-bit signed integers, and packs the results into the
-///    destination. Positive values greater than 0x7F are saturated to 0x7F.
-///    Negative values less than 0x80 are saturated to 0x80.
+/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
+///    vector operands into 8-bit signed integers, and packs the results into
+///    the destination.
+///
+///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
+///    less than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
 ///
 /// \param __a
-///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-///   a signed integer and is converted to a 8-bit signed integer with
-///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///   written to the lower 64 bits of the result.
 /// \param __b
-///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-///   a signed integer and is converted to a 8-bit signed integer with
-///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///   written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
@@ -4077,26 +4088,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// Converts 32-bit signed integers from both 128-bit integer vector
-///    operands into 16-bit signed integers, and packs the results into the
-///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative values less than 0x8000 are saturated to 0x8000.
+/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
+///    vector operands into 16-bit signed integers, and packs the results into
+///    the destination.
+///
+///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+///    values less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
 ///
 /// \param __a
-///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-///    a signed integer and is converted to a 16-bit signed integer with
-///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
 ///    are written to the lower 64 bits of the result.
 /// \param __b
-///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-///    a signed integer and is converted to a 16-bit signed integer with
-///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
 ///    are written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
@@ -4104,26 +4111,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
-/// Converts 16-bit signed integers from both 128-bit integer vector
-///    operands into 8-bit unsigned integers, and packs the results into the
-///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0x00 are saturated to 0x00.
+/// Converts, with saturation, 16-bit signed integers from both 128-bit integer
+///    vector operands into 8-bit unsigned integers, and packs the results into
+///    the destination.
+///
+///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
+///    are saturated to 0x00.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
 ///
 /// \param __a
-///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-///    a signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///    written to the lower 64 bits of the result.
 /// \param __b
-///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-///    a signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
 ///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h
index ea832fac4f9922..22d1a780bbfd4e 100644
--- a/clang/lib/Headers/fmaintrin.h
+++ b/clang/lib/Headers/fmaintrin.h
@@ -60,7 +60,8 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a scalar multiply-add of the single-precision values in the
 ///    low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@@ -88,7 +89,8 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a scalar multiply-add of the double-precision values in the
 ///    low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@@ -156,7 +158,8 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a scalar multiply-subtract of the single-precision values in
 ///    the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@@ -184,7 +187,8 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a scalar multiply-subtract of the double-precision values in
 ///    the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@@ -252,7 +256,8 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a scalar negated multiply-add of the single-precision values in
 ///    the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@@ -280,7 +285,8 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a scalar negated multiply-add of the double-precision values
 ///    in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@@ -348,7 +354,8 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a scalar negated multiply-subtract of the single-precision
 ///    values in the low 32 bits of 128-bit vectors of [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[127:32] = __A[127:32]
 /// \endcode
@@ -376,7 +383,8 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a scalar negated multiply-subtract of the double-precision
 ///    values in the low 64 bits of 128-bit vectors of [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = __A[127:64]
 /// \endcode
@@ -404,7 +412,8 @@ _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@@ -430,7 +439,8 @@ _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 /// \endcode
@@ -454,7 +464,8 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [4 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@@ -480,7 +491,8 @@ _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
 ///    [2 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 /// \endcode
@@ -664,7 +676,8 @@ _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 
 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 ///    [8 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
@@ -694,7 +707,8 @@ _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 
 /// Computes a multiply with alternating add/subtract of 256-bit vectors of
 ///    [4 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
 /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
@@ -720,7 +734,8 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 
 /// Computes a vector multiply with alternating add/subtract of 256-bit
 ///    vectors of [8 x float].
-/// \code
+///
+/// \code{.operation}
 /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
 /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
 /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
@@ -750,7 +765,8 @@ _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 
 /// Computes a vector multiply with alternating add/subtract of 256-bit
 ///    vectors of [4 x double].
-/// \code
+///
+/// \code{.operation}
 /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
 /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
 /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h
index 08849f01071aea..962d24738e7aa4 100644
--- a/clang/lib/Headers/mmintrin.h
+++ b/clang/lib/Headers/mmintrin.h
@@ -105,28 +105,23 @@ _mm_cvtm64_si64(__m64 __m)
     return (long long)__m;
 }
 
-/// Converts 16-bit signed integers from both 64-bit integer vector
-///    parameters of [4 x i16] into 8-bit signed integer values, and constructs
-///    a 64-bit integer vector of [8 x i8] as the result. Positive values
-///    greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
-///    are saturated to 0x80.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+///    vector parameters of [4 x i16] into 8-bit signed integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
+///    less than 0x80 are saturated to 0x80.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit signed integer with
-///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
-///    Negative values less than 0x80 are saturated to 0x80. The converted
-///    [4 x i8] values are written to the lower 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit signed integer with
-///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
-///    Negative values less than 0x80 are saturated to 0x80. The converted
-///    [4 x i8] values are written to the upper 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -135,28 +130,23 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// Converts 32-bit signed integers from both 64-bit integer vector
-///    parameters of [2 x i32] into 16-bit signed integer values, and constructs
-///    a 64-bit integer vector of [4 x i16] as the result. Positive values
-///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
-///    0x8000 are saturated to 0x8000.
+/// Converts, with saturation, 32-bit signed integers from both 64-bit integer
+///    vector parameters of [2 x i32] into 16-bit signed integer values, and
+///    constructs a 64-bit integer vector of [4 x i16] as the result.
+///
+///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
+///    values less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-///    32-bit signed integer and is converted to a 16-bit signed integer with
-///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative values less than 0x8000 are saturated to 0x8000. The converted
-///    [2 x i16] values are written to the lower 32 bits of the result.
+///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-///    32-bit signed integer and is converted to a 16-bit signed integer with
-///    saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-///    Negative values less than 0x8000 are saturated to 0x8000. The converted
-///    [2 x i16] values are written to the upper 32 bits of the result.
+///    A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -165,28 +155,23 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
 }
 
-/// Converts 16-bit signed integers from both 64-bit integer vector
-///    parameters of [4 x i16] into 8-bit unsigned integer values, and
-///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
-///    greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
-///    to 0.
+/// Converts, with saturation, 16-bit signed integers from both 64-bit integer
+///    vector parameters of [4 x i16] into 8-bit unsigned integer values, and
+///    constructs a 64-bit integer vector of [8 x i8] as the result.
+///
+///    Values greater than 0xFF are saturated to 0xFF. Values less than 0 are
+///    saturated to 0.
 ///
 /// \headerfile <x86intrin.h>
 ///
 /// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
 ///
 /// \param __m1
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0 are saturated to 0. The converted [4 x i8] values are written to
-///    the lower 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the lower 32 bits of the result.
 /// \param __m2
-///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-///    16-bit signed integer and is converted to an 8-bit unsigned integer with
-///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-///    than 0 are saturated to 0. The converted [4 x i8] values are written to
-///    the upper 32 bits of the result.
+///    A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are
+///    written to the upper 32 bits of the result.
 /// \returns A 64-bit integer vector of [8 x i8] containing the converted
 ///    values.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
@@ -400,11 +385,13 @@ _mm_add_pi32(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// Adds each 8-bit signed integer element of the first 64-bit integer
-///    vector of [8 x i8] to the corresponding 8-bit signed integer element of
-///    the second 64-bit integer vector of [8 x i8]. Positive sums greater than
-///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
-///    0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+/// Adds, with saturation, each 8-bit signed integer element of the first
+///    64-bit integer vector of [8 x i8] to the corresponding 8-bit signed
+///    integer element of the second 64-bit integer vector of [8 x i8].
+///
+///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
+///    less than 0x80 are saturated to 0x80. The results are packed into a
+///    64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -422,12 +409,13 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// Adds each 16-bit signed integer element of the first 64-bit integer
-///    vector of [4 x i16] to the corresponding 16-bit signed integer element of
-///    the second 64-bit integer vector of [4 x i16]. Positive sums greater than
-///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
-///    saturated to 0x8000. The results are packed into a 64-bit integer vector
-///    of [4 x i16].
+/// Adds, with saturation, each 16-bit signed integer element of the first
+///    64-bit integer vector of [4 x i16] to the corresponding 16-bit signed
+///    integer element of the second 64-bit integer vector of [4 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000. The results are packed into a
+///    64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -445,11 +433,12 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
 }
 
-/// Adds each 8-bit unsigned integer element of the first 64-bit integer
-///    vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
-///    the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
-///    saturated to 0xFF. The results are packed into a 64-bit integer vector of
-///    [8 x i8].
+/// Adds, with saturation, each 8-bit unsigned integer element of the first
+///    64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned
+///    integer element of the second 64-bit integer vector of [8 x i8].
+///
+///    Sums greater than 0xFF are saturated to 0xFF. The results are packed
+///    into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -467,11 +456,12 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// Adds each 16-bit unsigned integer element of the first 64-bit integer
-///    vector of [4 x i16] to the corresponding 16-bit unsigned integer element
-///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
-///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
-///    integer vector of [4 x i16].
+/// Adds, with saturation, each 16-bit unsigned integer element of the first
+///    64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned
+///    integer element of the second 64-bit integer vector of [4 x i16].
+///
+///    Sums greater than 0xFFFF are saturated to 0xFFFF. The results are packed
+///    into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -552,12 +542,13 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
 }
 
-/// Subtracts each 8-bit signed integer element of the second 64-bit
-///    integer vector of [8 x i8] from the corresponding 8-bit signed integer
-///    element of the first 64-bit integer vector of [8 x i8]. Positive results
-///    greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
-///    are saturated to 0x80. The results are packed into a 64-bit integer
-///    vector of [8 x i8].
+/// Subtracts, with saturation, each 8-bit signed integer element of the second
+///    64-bit integer vector of [8 x i8] from the corresponding 8-bit signed
+///    integer element of the first 64-bit integer vector of [8 x i8].
+///
+///    Positive results greater than 0x7F are saturated to 0x7F. Negative
+///    results less than 0x80 are saturated to 0x80. The results are packed
+///    into a 64-bit integer vector of [8 x i8].
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -575,12 +566,13 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2)
     return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
 }
 
-/// Subtracts each 16-bit signed integer element of the second 64-bit
-///    integer vector of [4 x i16] from the corresponding 16-bit signed integer
-///    element of the first 64-bit integer vector of [4 x i16]. Positive results
-///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
-///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
-///    integer vector of [4 x i16].
+/// Subtracts, with saturation, each 16-bit signed integer element of the
+///    second 64-bit integer vector of [4 x i16] from the corresponding 16-bit
+///    signed integer element of the first 64-bit integer vector of [4 x i16].
+///
+///    Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative
+///    results less than 0x8000 are saturated to 0x8000. The results are packed
+///    into a 64-bit integer vector of [4 x i16].
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/Headers/prfchwintrin.h b/clang/lib/Headers/prfchwintrin.h
index d2f91aa0123e0e..8a13784543c5f1 100644
--- a/clang/lib/Headers/prfchwintrin.h
+++ b/clang/lib/Headers/prfchwintrin.h
@@ -15,9 +15,10 @@
 #define __PRFCHWINTRIN_H
 
 /// Loads a memory sequence containing the specified memory address into
-///    all data cache levels. The cache-coherency state is set to exclusive.
-///    Data can be read from and written to the cache line without additional
-///    delay.
+///    all data cache levels.
+///
+///    The cache-coherency state is set to exclusive. Data can be read from
+///    and written to the cache line without additional delay.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -32,10 +33,11 @@ _m_prefetch(void *__P)
 }
 
 /// Loads a memory sequence containing the specified memory address into
-///    the L1 data cache and sets the cache-coherency to modified. This
-///    provides a hint to the processor that the cache line will be modified.
-///    It is intended for use when the cache line will be written to shortly
-///    after the prefetch is performed.
+///    the L1 data cache and sets the cache-coherency state to modified.
+///
+///    This provides a hint to the processor that the cache line will be
+///    modified. It is intended for use when the cache line will be written to
+///    shortly after the prefetch is performed.
 ///
 ///    Note that the effect of this intrinsic is dependent on the processor
 ///    implementation.
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 005d7db9c3c308..c52ffb77e33d5c 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1431,8 +1431,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
-/// Converts 32-bit signed integers from both 128-bit integer vector
-///    operands into 16-bit unsigned integers, and returns the packed result.
+/// Converts, with saturation, 32-bit signed integers from both 128-bit integer
+///    vector operands into 16-bit unsigned integers, and returns the packed
+///    result.
+///
 ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
 ///    0x0000 are saturated to 0x0000.
 ///
@@ -1441,17 +1443,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
 ///
 /// \param __V1
-///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-///    signed integer and is converted to a 16-bit unsigned integer with
-///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-///    are written to the lower 64 bits of the result.
+///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+///    written to the lower 64 bits of the result.
 /// \param __V2
-///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-///    signed integer and is converted to a 16-bit unsigned integer with
-///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-///    are written to the higher 64 bits of the result.
+///    A 128-bit vector of [4 x i32]. The converted [4 x i16] values are
+///    written to the higher 64 bits of the result.
 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1,
                                                               __m128i __V2) {
diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h
index 7d8dc46c57bfeb..bf8327b692d1cf 100644
--- a/clang/lib/Headers/tmmintrin.h
+++ b/clang/lib/Headers/tmmintrin.h
@@ -271,10 +271,11 @@ _mm_hadd_pi32(__m64 __a, __m64 __b)
     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
 }
 
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
-///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-///    0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+///    in two packed 128-bit vectors of [8 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -296,10 +297,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b)
     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
-///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-///    0x8000.
+/// Horizontally adds, with saturation, the adjacent pairs of values contained
+///    in two packed 64-bit vectors of [4 x i16].
+///
+///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
+///    less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -413,10 +415,11 @@ _mm_hsub_pi32(__m64 __a, __m64 __b)
     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
 }
 
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
-///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-///    saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+///    contained in two packed 128-bit vectors of [8 x i16].
+///
+///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative differences less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
@@ -438,10 +441,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b)
     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
 }
 
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
-///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-///    saturated to 0x8000.
+/// Horizontally subtracts, with saturation, the adjacent pairs of values
+///    contained in two packed 64-bit vectors of [4 x i16].
+///
+///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative differences less than 0x8000 are saturated to 0x8000.
 ///
 /// \headerfile <x86intrin.h>
 ///
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt
index fdc4f064f29e95..19fc4c3abde53a 100644
--- a/clang/lib/InstallAPI/CMakeLists.txt
+++ b/clang/lib/InstallAPI/CMakeLists.txt
@@ -1,11 +1,14 @@
 set(LLVM_LINK_COMPONENTS
   Support
   TextAPI
+  Core
   )
 
 add_clang_library(clangInstallAPI
   FileList.cpp
+  Frontend.cpp
   HeaderFile.cpp
+  Visitor.cpp
 
   LINK_LIBS
   clangAST
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp
new file mode 100644
index 00000000000000..9f675ef7d1bd22
--- /dev/null
+++ b/clang/lib/InstallAPI/Frontend.cpp
@@ -0,0 +1,58 @@
+//===- Frontend.cpp ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/InstallAPI/Frontend.h"
+#include "clang/AST/Availability.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace clang::installapi {
+
+static StringRef getFileExtension(clang::Language Lang) {
+  switch (Lang) {
+  default:
+    llvm_unreachable("Unexpected language option.");
+  case clang::Language::C:
+    return ".c";
+  case clang::Language::CXX:
+    return ".cpp";
+  case clang::Language::ObjC:
+    return ".m";
+  case clang::Language::ObjCXX:
+    return ".mm";
+  }
+}
+
+std::unique_ptr<MemoryBuffer> createInputBuffer(const InstallAPIContext &Ctx) {
+  assert(Ctx.Type != HeaderType::Unknown &&
+         "unexpected access level for parsing");
+  SmallString<4096> Contents;
+  raw_svector_ostream OS(Contents);
+  for (const HeaderFile &H : Ctx.InputHeaders) {
+    if (H.getType() != Ctx.Type)
+      continue;
+    if (Ctx.LangMode == Language::C || Ctx.LangMode == Language::CXX)
+      OS << "#include ";
+    else
+      OS << "#import ";
+    if (H.useIncludeName())
+      OS << "<" << H.getIncludeName() << ">";
+    else
+      OS << "\"" << H.getPath() << "\"";
+  }
+  if (Contents.empty())
+    return nullptr;
+
+  return llvm::MemoryBuffer::getMemBufferCopy(
+      Contents, "installapi-includes" + getFileExtension(Ctx.LangMode));
+}
+
+} // namespace clang::installapi
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp
new file mode 100644
index 00000000000000..9b333a6142ae8f
--- /dev/null
+++ b/clang/lib/InstallAPI/Visitor.cpp
@@ -0,0 +1,94 @@
+//===- Visitor.cpp ---------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/InstallAPI/Visitor.h"
+#include "clang/AST/Availability.h"
+#include "clang/Basic/Linkage.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace clang::installapi {
+
+// Exported NamedDecl needs to have externally visibiliy linkage and
+// default visibility from LinkageComputer.
+static bool isExported(const NamedDecl *D) {
+  auto LV = D->getLinkageAndVisibility();
+  return isExternallyVisible(LV.getLinkage()) &&
+         (LV.getVisibility() == DefaultVisibility);
+}
+
+static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal) {
+  SymbolFlags Result = SymbolFlags::None;
+  if (WeakDef)
+    Result |= SymbolFlags::WeakDefined;
+  if (ThreadLocal)
+    Result |= SymbolFlags::ThreadLocalValue;
+
+  return Result;
+}
+
+void InstallAPIVisitor::HandleTranslationUnit(ASTContext &ASTCtx) {
+  if (ASTCtx.getDiagnostics().hasErrorOccurred())
+    return;
+
+  auto *D = ASTCtx.getTranslationUnitDecl();
+  TraverseDecl(D);
+}
+
+std::string InstallAPIVisitor::getMangledName(const NamedDecl *D) const {
+  SmallString<256> Name;
+  if (MC->shouldMangleDeclName(D)) {
+    raw_svector_ostream NStream(Name);
+    MC->mangleName(D, NStream);
+  } else
+    Name += D->getNameAsString();
+
+  return getBackendMangledName(Name);
+}
+
+std::string InstallAPIVisitor::getBackendMangledName(Twine Name) const {
+  SmallString<256> FinalName;
+  Mangler::getNameWithPrefix(FinalName, Name, DataLayout(Layout));
+  return std::string(FinalName);
+}
+
+/// Collect all global variables.
+bool InstallAPIVisitor::VisitVarDecl(const VarDecl *D) {
+  // Skip function parameters.
+  if (isa<ParmVarDecl>(D))
+    return true;
+
+  // Skip variables in records. They are handled seperately for C++.
+  if (D->getDeclContext()->isRecord())
+    return true;
+
+  // Skip anything inside functions or methods.
+  if (!D->isDefinedOutsideFunctionOrMethod())
+    return true;
+
+  // If this is a template but not specialization or instantiation, skip.
+  if (D->getASTContext().getTemplateOrSpecializationInfo(D) &&
+      D->getTemplateSpecializationKind() == TSK_Undeclared)
+    return true;
+
+  // TODO: Capture SourceLocation & Availability for Decls.
+  const RecordLinkage Linkage =
+      isExported(D) ? RecordLinkage::Exported : RecordLinkage::Internal;
+  const bool WeakDef = D->hasAttr<WeakAttr>();
+  const bool ThreadLocal = D->getTLSKind() != VarDecl::TLS_None;
+  Slice.addGlobal(getMangledName(D), Linkage, GlobalRecord::Kind::Variable,
+                  getFlags(WeakDef, ThreadLocal));
+  return true;
+}
+
+} // namespace clang::installapi
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index defd83ec8e179c..01b191ab0eeaf4 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -310,8 +310,12 @@ class TrivialFunctionAnalysisVisitor
         return true;
       if (isa<EnumConstantDecl>(decl))
         return true;
-      if (auto *VD = dyn_cast<VarDecl>(decl))
-        return VD->hasConstantInitialization() && VD->getEvaluatedValue();
+      if (auto *VD = dyn_cast<VarDecl>(decl)) {
+        if (VD->hasConstantInitialization() && VD->getEvaluatedValue())
+          return true;
+        auto *Init = VD->getInit();
+        return !Init || Visit(Init);
+      }
     }
     return false;
   }
diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp
index 0a1e0f3fd28e9d..993e3618a37848 100644
--- a/clang/test/AST/Interp/cxx11.cpp
+++ b/clang/test/AST/Interp/cxx11.cpp
@@ -22,3 +22,11 @@ int array2[recurse2]; // both-warning {{variable length arrays in C++}} \
                       // both-note {{initializer of 'recurse2' is not a constant expression}} \
                       // expected-error {{variable length array declaration not allowed at file scope}} \
                       // ref-warning {{variable length array folded to constant array as an extension}}
+
+struct S {
+  int m;
+};
+constexpr S s = { 5 };
+constexpr const int *p = &s.m + 1;
+
+constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok
diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp
index 5c9c6257965108..000ffe39eb94a7 100644
--- a/clang/test/AST/Interp/cxx20.cpp
+++ b/clang/test/AST/Interp/cxx20.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify %s
-// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify=both,expected -fcxx-exceptions %s
+// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=both,ref -fcxx-exceptions %s
 
 void test_alignas_operand() {
   alignas(8) char dummy;
@@ -58,13 +58,10 @@ static_assert(pointerAssign2() == 12, "");
 
 constexpr int unInitLocal() {
   int a;
-  return a; // ref-note {{read of uninitialized object}} \
-            // expected-note {{read of uninitialized object}}
+  return a; // both-note {{read of uninitialized object}}
 }
-static_assert(unInitLocal() == 0, ""); // ref-error {{not an integral constant expression}} \
-                                       // ref-note {{in call to 'unInitLocal()'}} \
-                                       // expected-error {{not an integral constant expression}} \
-                                       // expected-note {{in call to 'unInitLocal()'}} \
+static_assert(unInitLocal() == 0, ""); // both-error {{not an integral constant expression}} \
+                                       // both-note {{in call to 'unInitLocal()'}}
 
 constexpr int initializedLocal() {
   int a;
@@ -75,25 +72,19 @@ static_assert(initializedLocal() == 20);
 
 constexpr int initializedLocal2() {
   int a[2];
-  return *a; // expected-note {{read of uninitialized object is not allowed in a constant expression}} \
-             // ref-note {{read of uninitialized object is not allowed in a constant expression}}
+  return *a; // both-note {{read of uninitialized object is not allowed in a constant expression}}
 }
-static_assert(initializedLocal2() == 20); // expected-error {{not an integral constant expression}} \
-                                          // expected-note {{in call to}} \
-                                          // ref-error {{not an integral constant expression}} \
-                                          // ref-note {{in call to}}
+static_assert(initializedLocal2() == 20); // both-error {{not an integral constant expression}} \
+                                          // both-note {{in call to}}
 
 
 struct Int { int a; };
 constexpr int initializedLocal3() {
   Int i;
-  return i.a; // ref-note {{read of uninitialized object is not allowed in a constant expression}} \
-              // expected-note {{read of uninitialized object}}
+  return i.a; // both-note {{read of uninitialized object is not allowed in a constant expression}}
 }
-static_assert(initializedLocal3() == 20); // expected-error {{not an integral constant expression}} \
-                                          // expected-note {{in call to}} \
-                                          // ref-error {{not an integral constant expression}} \
-                                          // ref-note {{in call to}}
+static_assert(initializedLocal3() == 20); // both-error {{not an integral constant expression}} \
+                                          // both-note {{in call to}}
 
 
 
@@ -137,22 +128,16 @@ static_assert(!b4); // ref-error {{not an integral constant expression}} \
 namespace UninitializedFields {
   class A {
   public:
-    int a; // expected-note 4{{subobject declared here}} \
-           // ref-note 4{{subobject declared here}}
+    int a; // both-note 4{{subobject declared here}}
     constexpr A() {}
   };
-  constexpr A a; // expected-error {{must be initialized by a constant expression}} \
-                 // expected-note {{subobject 'a' is not initialized}} \
-                 // ref-error {{must be initialized by a constant expression}} \
-                 // ref-note {{subobject 'a' is not initialized}}
-  constexpr A aarr[2]; // expected-error {{must be initialized by a constant expression}} \
-                       // expected-note {{subobject 'a' is not initialized}} \
-                       // ref-error {{must be initialized by a constant expression}} \
-                       // ref-note {{subobject 'a' is not initialized}}
+  constexpr A a; // both-error {{must be initialized by a constant expression}} \
+                 // both-note {{subobject 'a' is not initialized}}
+  constexpr A aarr[2]; // both-error {{must be initialized by a constant expression}} \
+                       // both-note {{subobject 'a' is not initialized}}
   class F {
     public:
-      int f; // expected-note 3{{subobject declared here}} \
-             // ref-note 3{{subobject declared here}}
+      int f; // both-note 3{{subobject declared here}}
 
       constexpr F() {}
       constexpr F(bool b) {
@@ -161,26 +146,19 @@ namespace UninitializedFields {
       }
   };
 
-  constexpr F foo[2] = {true}; // expected-error {{must be initialized by a constant expression}} \
-                               // expected-note {{subobject 'f' is not initialized}} \
-                               // ref-error {{must be initialized by a constant expression}} \
-                               // ref-note {{subobject 'f' is not initialized}}
-  constexpr F foo2[3] = {true, false, true}; // expected-error {{must be initialized by a constant expression}} \
-                                             // expected-note {{subobject 'f' is not initialized}} \
-                                             // ref-error {{must be initialized by a constant expression}} \
-                                             // ref-note {{subobject 'f' is not initialized}}
-  constexpr F foo3[3] = {true, true, F()}; // expected-error {{must be initialized by a constant expression}} \
-                                           // expected-note {{subobject 'f' is not initialized}} \
-                                           // ref-error {{must be initialized by a constant expression}} \
-                                           // ref-note {{subobject 'f' is not initialized}}
+  constexpr F foo[2] = {true}; // both-error {{must be initialized by a constant expression}} \
+                               // both-note {{subobject 'f' is not initialized}}
+  constexpr F foo2[3] = {true, false, true}; // both-error {{must be initialized by a constant expression}} \
+                                             // both-note {{subobject 'f' is not initialized}}
+  constexpr F foo3[3] = {true, true, F()}; // both-error {{must be initialized by a constant expression}} \
+                                           // both-note {{subobject 'f' is not initialized}}
 
 
 
   class Base {
   public:
     bool b;
-    int a; // expected-note {{subobject declared here}} \
-           // ref-note {{subobject declared here}}
+    int a; // both-note {{subobject declared here}}
     constexpr Base() : b(true) {}
   };
 
@@ -188,56 +166,44 @@ namespace UninitializedFields {
   public:
     constexpr Derived() : Base() {}   };
 
-  constexpr Derived D; // expected-error {{must be initialized by a constant expression}} \
-                       // expected-note {{subobject 'a' is not initialized}} \
-                       // ref-error {{must be initialized by a constant expression}} \
-                       // ref-note {{subobject 'a' is not initialized}}
+  constexpr Derived D; // both-error {{must be initialized by a constant expression}} \
+                       // both-note {{subobject 'a' is not initialized}}
 
   class C2 {
   public:
     A a;
     constexpr C2() {}   };
-  constexpr C2 c2; // expected-error {{must be initialized by a constant expression}} \
-                   // expected-note {{subobject 'a' is not initialized}} \
-                   // ref-error {{must be initialized by a constant expression}} \
-                   // ref-note {{subobject 'a' is not initialized}}
+  constexpr C2 c2; // both-error {{must be initialized by a constant expression}} \
+                   // both-note {{subobject 'a' is not initialized}}
 
   class C3 {
   public:
     A a[2];
     constexpr C3() {}
   };
-  constexpr C3 c3; // expected-error {{must be initialized by a constant expression}} \
-                   // expected-note {{subobject 'a' is not initialized}} \
-                   // ref-error {{must be initialized by a constant expression}} \
-                   // ref-note {{subobject 'a' is not initialized}}
+  constexpr C3 c3; // both-error {{must be initialized by a constant expression}} \
+                   // both-note {{subobject 'a' is not initialized}}
 
   class C4 {
   public:
-    bool B[2][3]; // expected-note {{subobject declared here}} \
-                  // ref-note {{subobject declared here}}
+    bool B[2][3]; // both-note {{subobject declared here}}
     constexpr C4(){}
   };
-  constexpr C4 c4; // expected-error {{must be initialized by a constant expression}} \
-                   // expected-note {{subobject 'B' is not initialized}} \
-                   // ref-error {{must be initialized by a constant expression}} \
-                   // ref-note {{subobject 'B' is not initialized}}
+  constexpr C4 c4; // both-error {{must be initialized by a constant expression}} \
+                   // both-note {{subobject 'B' is not initialized}}
 };
 
 namespace ConstThis {
   class Foo {
-    const int T = 12; // expected-note {{declared const here}} \
-                      // ref-note {{declared const here}}
+    const int T = 12; // both-note {{declared const here}}
     int a;
   public:
     constexpr Foo() {
       this->a = 10;
-      T = 13; // expected-error {{cannot assign to non-static data member 'T' with const-qualified type}} \
-              // ref-error {{cannot assign to non-static data member 'T' with const-qualified type}}
+      T = 13; // both-error {{cannot assign to non-static data member 'T' with const-qualified type}}
     }
   };
-  constexpr Foo F; // expected-error {{must be initialized by a constant expression}} \
-                   // ref-error {{must be initialized by a constant expression}}
+  constexpr Foo F; // both-error {{must be initialized by a constant expression}}
 
 
   class FooDtor {
@@ -264,8 +230,7 @@ namespace ConstThis {
     constexpr ctor_test() {
       if (Good)
         a = 10;
-      int local = 100 / a; // expected-note {{division by zero}} \
-                           // ref-note {{division by zero}}
+      int local = 100 / a; // both-note {{division by zero}}
     }
   };
 
@@ -277,22 +242,17 @@ namespace ConstThis {
     constexpr ~dtor_test() {
       if (Good)
         a = 10;
-      int local = 100 / a; // expected-note {{division by zero}} \
-                           // ref-note {{division by zero}}
+      int local = 100 / a; // both-note {{division by zero}}
     }
   };
 
   constexpr ctor_test<true> good_ctor;
   constexpr dtor_test<true> good_dtor;
 
-  constexpr ctor_test<false> bad_ctor; // expected-error {{must be initialized by a constant expression}} \
-                                       // expected-note {{in call to}} \
-                                       // ref-error {{must be initialized by a constant expression}} \
-                                       // ref-note {{in call to}}
-  constexpr dtor_test<false> bad_dtor; // expected-error {{must have constant destruction}} \
-                                       // expected-note {{in call to}} \
-                                       // ref-error {{must have constant destruction}} \
-                                       // ref-note {{in call to}}
+  constexpr ctor_test<false> bad_ctor; // both-error {{must be initialized by a constant expression}} \
+                                       // both-note {{in call to}}
+  constexpr dtor_test<false> bad_dtor; // both-error {{must have constant destruction}} \
+                                       // both-note {{in call to}}
 };
 
 namespace BaseInit {
@@ -311,10 +271,8 @@ namespace BaseInit {
   };
 
   static_assert(Final{1, 2, 3}.c == 3, ""); // OK
-  static_assert(Final{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
-                                            // expected-note {{read of uninitialized object}} \
-                                            // ref-error {{not an integral constant expression}} \
-                                            // ref-note {{read of uninitialized object}}
+  static_assert(Final{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+                                            // both-note {{read of uninitialized object}}
 
 
   struct Mixin  {
@@ -333,10 +291,8 @@ namespace BaseInit {
 
   static_assert(Final2{1, 2, 3}.c == 3, ""); // OK
   static_assert(Final2{1, 2, 3}.b == 2, ""); // OK
-  static_assert(Final2{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
-                                             // expected-note {{read of uninitialized object}} \
-                                             // ref-error {{not an integral constant expression}} \
-                                             // ref-note {{read of uninitialized object}}
+  static_assert(Final2{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+                                             // both-note {{read of uninitialized object}}
 
 
   struct Mixin3  {
@@ -352,10 +308,8 @@ namespace BaseInit {
 
   static_assert(Final3{1, 2, 3}.c == 3, ""); // OK
   static_assert(Final3{1, 2, 3}.b == 2, ""); // OK
-  static_assert(Final3{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \
-                                             // expected-note {{read of uninitialized object}} \
-                                             // ref-error {{not an integral constant expression}} \
-                                             // ref-note {{read of uninitialized object}}
+  static_assert(Final3{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \
+                                             // both-note {{read of uninitialized object}}
 };
 
 namespace Destructors {
@@ -633,16 +587,13 @@ namespace ImplicitFunction {
 
    /// The operator= call here will fail and the diagnostics should be fine.
    b = a; // ref-note {{subobject 'a' is not initialized}} \
-          // ref-note {{in call to}} \
           // expected-note {{read of uninitialized object}} \
-          // expected-note {{in call to}}
+          // both-note {{in call to}}
 
    return 1;
   }
-  static_assert(callMe() == 1, ""); // ref-error {{not an integral constant expression}} \
-                                    // ref-note {{in call to 'callMe()'}} \
-                                    // expected-error {{not an integral constant expression}} \
-                                    // expected-note {{in call to 'callMe()'}}
+  static_assert(callMe() == 1, ""); // both-error {{not an integral constant expression}} \
+                                    // both-note {{in call to 'callMe()'}}
 }
 
 /// FIXME: Unfortunately, the similar tests in test/SemaCXX/{compare-cxx2a.cpp use member pointers,
@@ -680,8 +631,7 @@ namespace ThreeWayCmp {
   static_assert(1.0 <=> 2.f == -1, "");
   static_assert(1.0 <=> 1.0 == 0, "");
   static_assert(2.0 <=> 1.0 == 1, "");
-  constexpr int k = (1 <=> 1, 0); // expected-warning {{comparison result unused}} \
-                                  // ref-warning {{comparison result unused}}
+  constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}}
   static_assert(k== 0, "");
 
   /// Pointers.
@@ -690,10 +640,8 @@ namespace ThreeWayCmp {
   constexpr const int *pa1 = &a[1];
   constexpr const int *pa2 = &a[2];
   constexpr const int *pb1 = &b[1];
-  static_assert(pa1 <=> pb1 != 0, ""); // expected-error {{not an integral constant expression}} \
-                                       // expected-note {{has unspecified value}} \
-                                       // ref-error {{not an integral constant expression}} \
-                                       // ref-note {{has unspecified value}}
+  static_assert(pa1 <=> pb1 != 0, ""); // both-error {{not an integral constant expression}} \
+                                       // both-note {{has unspecified value}} \
   static_assert(pa1 <=> pa1 == 0, "");
   static_assert(pa1 <=> pa2 == -1, "");
   static_assert(pa2 <=> pa1 == 1, "");
@@ -799,3 +747,30 @@ void f2() {
                        // access info for unnamed bit-field
 }
 }
+
+namespace FailingDestructor {
+  struct D {
+    int n;
+    bool can_destroy;
+
+    constexpr ~D() {
+      if (!can_destroy)
+        throw "oh no";
+    }
+  };
+  template<D d>
+  void f() {} // both-note {{invalid explicitly-specified argument}}
+
+  void g() {
+    f<D{0, false}>(); // both-error {{no matching function}}
+  }
+}
+
+
+void overflowInSwitchCase(int n) {
+  switch (n) {
+  case (int)(float)1e300: // both-error {{constant expression}} \
+                          // both-note {{value +Inf is outside the range of representable values of type 'int'}}
+    break;
+  }
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index ac16a31293f3de..80a9a263dab140 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -199,6 +199,8 @@ class RefCounted {
   bool trivial23() const { return OptionSet<Flags>::fromRaw(v).contains(Flags::Flag1); }
   int trivial24() const { ASSERT(v); return v; }
   unsigned trivial25() const { return __c11_atomic_load((volatile _Atomic(unsigned) *)&v, __ATOMIC_RELAXED); }
+  bool trivial26() { bool hasValue = v; return !hasValue; }
+  bool trivial27(int v) { bool value; value = v ? 1 : 0; return value; }
 
   static RefCounted& singleton() {
     static RefCounted s_RefCounted;
@@ -262,6 +264,15 @@ class RefCounted {
     return __c11_atomic_load((volatile _Atomic(unsigned) *)another(), __ATOMIC_RELAXED);
   }
 
+  void nonTrivial11() {
+    Number num(0.3);
+  }
+
+  bool nonTrivial12() {
+    bool val = otherFunction();
+    return val;
+  }
+
   unsigned v { 0 };
   Number* number { nullptr };
   Enum enumValue { Enum::Value1 };
@@ -309,6 +320,8 @@ class UnrelatedClass {
     getFieldTrivial().trivial23(); // no-warning
     getFieldTrivial().trivial24(); // no-warning
     getFieldTrivial().trivial25(); // no-warning
+    getFieldTrivial().trivial26(); // no-warning
+    getFieldTrivial().trivial27(5); // no-warning
     RefCounted::singleton().trivial18(); // no-warning
     RefCounted::singleton().someFunction(); // no-warning
 
@@ -334,6 +347,10 @@ class UnrelatedClass {
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
     getFieldTrivial().nonTrivial10();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial11();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial12();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
   }
 };
 
diff --git a/clang/test/Driver/basic-block-address-map.c b/clang/test/Driver/basic-block-address-map.c
index 022f972b412d6b..12393e8ebfd54e 100644
--- a/clang/test/Driver/basic-block-address-map.c
+++ b/clang/test/Driver/basic-block-address-map.c
@@ -1,8 +1,9 @@
-// RUN: %clang -### -target x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
+// RUN: %clang -### --target=x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
+// RUN: %clang -### --target=aarch64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s
 // CHECK-PRESENT: -fbasic-block-address-map
 
-// RUN: %clang -### -target x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
+// RUN: %clang -### --target=x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT
 // CHECK-ABSENT-NOT: -fbasic-block-address-map
 
-// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
+// RUN: not %clang -c --target=x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s
 // CHECK-TRIPLE: error: unsupported option '-fbasic-block-address-map' for target
diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c
index a2a3dc02354503..968f33b4cc035c 100644
--- a/clang/test/Driver/split-debug.c
+++ b/clang/test/Driver/split-debug.c
@@ -124,8 +124,3 @@
 // G1_NOSPLIT: "-debug-info-kind=line-tables-only"
 // G1_NOSPLIT-NOT: "-split-dwarf-file"
 // G1_NOSPLIT-NOT: "-split-dwarf-output"
-
-/// Do not generate -ggnu-pubnames for -glldb
-// RUN: %clang -### -c -target x86_64 -gsplit-dwarf -g -glldb %s 2>&1 | FileCheck %s --check-prefixes=GLLDBSPLIT
-
-// GLLDBSPLIT-NOT: "-ggnu-pubnames"
diff --git a/clang/test/InstallAPI/basic.test b/clang/test/InstallAPI/basic.test
index 22b04792ca2c30..5b41ccd517b0af 100644
--- a/clang/test/InstallAPI/basic.test
+++ b/clang/test/InstallAPI/basic.test
@@ -16,6 +16,11 @@
 // CHECK-NOT: warning:  
 
 //--- basic_inputs.json
+{
+  "headers": [
+  ],
+  "version": "3"
+}
 
 //--- expected.tbd
 {
diff --git a/clang/test/InstallAPI/variables.test b/clang/test/InstallAPI/variables.test
new file mode 100644
index 00000000000000..6272867911f18b
--- /dev/null
+++ b/clang/test/InstallAPI/variables.test
@@ -0,0 +1,63 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+// RUN: sed -e "s|SRC_DIR|%/t|g" %t/vars_inputs.json.in > %t/vars_inputs.json
+
+/// Check multiple targets are captured.
+// RUN: clang-installapi -target arm64-apple-macos13.1 -target arm64e-apple-macos13.1 \
+// RUN: -fapplication-extension -install_name /usr/lib/vars.dylib \
+// RUN: %t/vars_inputs.json -o %t/vars.tbd 2>&1 | FileCheck %s --allow-empty
+// RUN: llvm-readtapi -compare %t/vars.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty
+
+// CHECK-NOT: error:  
+// CHECK-NOT: warning:  
+
+//--- vars.h
+extern int foo;
+
+//--- vars_inputs.json.in
+{
+  "headers": [ {
+    "path" : "SRC_DIR/vars.h",
+    "type" : "public"
+  }],
+  "version": "3"
+}
+
+//--- expected.tbd
+{
+  "main_library": {
+    "compatibility_versions": [
+      {
+        "version": "0"
+      }],
+    "current_versions": [
+      {
+        "version": "0"
+      }],
+    "install_names": [
+      {
+        "name": "/usr/lib/vars.dylib"
+      }
+    ],
+    "exported_symbols": [
+      {
+        "data": {
+          "global": [
+            "_foo"
+          ]
+        }
+      }
+    ],
+    "target_info": [
+      {
+        "min_deployment": "13.1",
+        "target": "arm64-macos"
+      },
+      {
+        "min_deployment": "13.1",
+        "target": "arm64e-macos"
+      }
+    ]
+  },
+  "tapi_tbd_version": 5
+}
diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index b8384c92c104f6..e05f4eac3ad198 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -14,6 +14,7 @@ add_clang_tool(clang-installapi
 
 clang_target_link_libraries(clang-installapi
   PRIVATE
+  clangAST
   clangInstallAPI
   clangBasic
   clangDriver
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index fc23ffd7ae6b9c..43c9fca0a82eec 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -12,12 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "Options.h"
-#include "clang/Basic/DiagnosticIDs.h"
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
-#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Driver/Tool.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
-#include "clang/InstallAPI/Context.h"
+#include "clang/InstallAPI/Frontend.h"
+#include "clang/Tooling/Tooling.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Option/Option.h"
 #include "llvm/Support/CommandLine.h"
@@ -27,7 +29,9 @@
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/TargetParser/Host.h"
+#include "llvm/TextAPI/RecordVisitor.h"
 #include "llvm/TextAPI/TextAPIWriter.h"
+#include <memory>
 
 using namespace clang;
 using namespace clang::installapi;
@@ -35,6 +39,36 @@ using namespace clang::driver::options;
 using namespace llvm::opt;
 using namespace llvm::MachO;
 
+static bool runFrontend(StringRef ProgName, bool Verbose,
+                        const InstallAPIContext &Ctx,
+                        llvm::vfs::InMemoryFileSystem *FS,
+                        const ArrayRef<std::string> InitialArgs) {
+
+  std::unique_ptr<llvm::MemoryBuffer> ProcessedInput = createInputBuffer(Ctx);
+  // Skip invoking cc1 when there are no header inputs.
+  if (!ProcessedInput)
+    return true;
+
+  if (Verbose)
+    llvm::errs() << getName(Ctx.Type) << " Headers:\n"
+                 << ProcessedInput->getBuffer() << "\n\n";
+
+  std::string InputFile = ProcessedInput->getBufferIdentifier().str();
+  FS->addFile(InputFile, /*ModTime=*/0, std::move(ProcessedInput));
+  // Reconstruct arguments with unique values like target triple or input
+  // headers.
+  std::vector<std::string> Args = {ProgName.data(), "-target",
+                                   Ctx.Slice->getTriple().str().c_str()};
+  llvm::copy(InitialArgs, std::back_inserter(Args));
+  Args.push_back(InputFile);
+
+  // Create & run invocation.
+  clang::tooling::ToolInvocation Invocation(
+      std::move(Args), std::make_unique<InstallAPIAction>(*Ctx.Slice), Ctx.FM);
+
+  return Invocation.run();
+}
+
 static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   // Setup Diagnostics engine.
   IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
@@ -48,9 +82,15 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
       new clang::DiagnosticIDs(), DiagOpts.get(),
       new clang::TextDiagnosticPrinter(llvm::errs(), DiagOpts.get()));
 
-  // Create file manager for all file operations.
+  // Create file manager for all file operations and holding in-memory generated
+  // inputs.
+  llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFileSystem(
+      new llvm::vfs::OverlayFileSystem(llvm::vfs::getRealFileSystem()));
+  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem(
+      new llvm::vfs::InMemoryFileSystem);
+  OverlayFileSystem->pushOverlay(InMemoryFileSystem);
   IntrusiveRefCntPtr<clang::FileManager> FM(
-      new FileManager(clang::FileSystemOptions()));
+      new FileManager(clang::FileSystemOptions(), OverlayFileSystem));
 
   // Set up driver to parse input arguments.
   auto DriverArgs = llvm::ArrayRef(Args).slice(1);
@@ -71,7 +111,10 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   Options Opts(*Diag, FM.get(), ArgList);
   if (Diag->hasErrorOccurred())
     return EXIT_FAILURE;
+
   InstallAPIContext Ctx = Opts.createContext();
+  if (Diag->hasErrorOccurred())
+    return EXIT_FAILURE;
 
   // Set up compilation.
   std::unique_ptr<CompilerInstance> CI(new CompilerInstance());
@@ -80,6 +123,21 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
   if (!CI->hasDiagnostics())
     return EXIT_FAILURE;
 
+  // Execute and gather AST results.
+  llvm::MachO::Records FrontendResults;
+  for (const auto &[Targ, Trip] : Opts.DriverOpts.Targets) {
+    for (const HeaderType Type :
+         {HeaderType::Public, HeaderType::Private, HeaderType::Project}) {
+      Ctx.Slice = std::make_shared<RecordsSlice>(Trip);
+      Ctx.Type = Type;
+      if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx,
+                       InMemoryFileSystem.get(), Opts.getClangFrontendArgs()))
+        return EXIT_FAILURE;
+      FrontendResults.emplace_back(std::move(Ctx.Slice));
+    }
+  }
+
+  // After symbols have been collected, prepare to write output.
   auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false,
                                   /*RemoveFileOnSignal=*/false,
                                   /*UseTemporary=*/false,
@@ -88,7 +146,13 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
     return EXIT_FAILURE;
 
   // Assign attributes for serialization.
-  InterfaceFile IF;
+  auto Symbols = std::make_unique<SymbolSet>();
+  for (const auto &FR : FrontendResults) {
+    SymbolConverter Converter(Symbols.get(), FR->getTarget());
+    FR->visit(Converter);
+  }
+
+  InterfaceFile IF(std::move(Symbols));
   for (const auto &TargetInfo : Opts.DriverOpts.Targets) {
     IF.addTarget(TargetInfo.first);
     IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first);
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp
index 562a643edfcf40..7d45e999448d9f 100644
--- a/clang/tools/clang-installapi/Options.cpp
+++ b/clang/tools/clang-installapi/Options.cpp
@@ -9,6 +9,7 @@
 #include "Options.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
+#include "clang/InstallAPI/FileList.h"
 #include "llvm/Support/Program.h"
 #include "llvm/TargetParser/Host.h"
 
@@ -68,6 +69,8 @@ bool Options::processDriverOptions(InputArgList &Args) {
     }
   }
 
+  DriverOpts.Verbose = Args.hasArgNoClaim(OPT_v);
+
   return true;
 }
 
@@ -104,10 +107,21 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM,
 
   if (!processLinkerOptions(ArgList))
     return;
+
+  /// Any remaining arguments should be handled by invoking the clang frontend.
+  for (const Arg *A : ArgList) {
+    if (A->isClaimed())
+      continue;
+    FrontendArgs.emplace_back(A->getAsString(ArgList));
+  }
+  FrontendArgs.push_back("-fsyntax-only");
 }
 
 InstallAPIContext Options::createContext() {
   InstallAPIContext Ctx;
+  Ctx.FM = FM;
+  Ctx.Diags = Diags;
+
   // InstallAPI requires two level namespacing.
   Ctx.BA.TwoLevelNamespace = true;
 
@@ -116,6 +130,21 @@ InstallAPIContext Options::createContext() {
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
+
+  // Process inputs.
+  for (const std::string &ListPath : DriverOpts.FileLists) {
+    auto Buffer = FM->getBufferForFile(ListPath);
+    if (auto Err = Buffer.getError()) {
+      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      return Ctx;
+    }
+    if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
+                                               Ctx.InputHeaders)) {
+      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      return Ctx;
+    }
+  }
+
   return Ctx;
 }
 
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index 4a84166a6c91b1..f68addf1972880 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -43,6 +43,9 @@ struct DriverOptions {
 
   /// \brief File encoding to print.
   llvm::MachO::FileType OutFT = llvm::MachO::FileType::TBD_V5;
+
+  /// \brief Print verbose output.
+  bool Verbose = false;
 };
 
 struct LinkerOptions {
@@ -78,9 +81,14 @@ class Options {
   Options(clang::DiagnosticsEngine &Diag, FileManager *FM,
           llvm::opt::InputArgList &Args);
 
+  /// \brief Get CC1 arguments after extracting out the irrelevant
+  /// ones.
+  std::vector<std::string> &getClangFrontendArgs() { return FrontendArgs; }
+
 private:
   DiagnosticsEngine *Diags;
   FileManager *FM;
+  std::vector<std::string> FrontendArgs;
 };
 
 } // namespace installapi
diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index bbb4e8d7c333e4..8a2b138d8d7020 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -771,8 +771,6 @@ mark_as_advanced(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
 add_subdirectory(lib)
 
 if(COMPILER_RT_INCLUDE_TESTS)
-  add_subdirectory(unittests)
-  add_subdirectory(test)
   # Don't build llvm-lit for runtimes-build, it will clean up map_config.
   if (COMPILER_RT_STANDALONE_BUILD AND NOT LLVM_RUNTIMES_BUILD)
     # If we have a valid source tree, generate llvm-lit into the bin directory.
@@ -782,11 +780,17 @@ if(COMPILER_RT_INCLUDE_TESTS)
       # Needed for lit support in standalone builds.
       include(AddLLVM)
       add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit ${CMAKE_CURRENT_BINARY_DIR}/llvm-lit)
+      # Ensure that the testsuite uses the local lit rather than
+      # LLVM_INSTALL_DIR/bin/llvm-lit (which probably does not exist).
+      get_llvm_lit_path(_base_dir _file_name)
+      set(LLVM_EXTERNAL_LIT "${_base_dir}/${_file_name}" CACHE STRING "Command used to spawn lit" FORCE)
     elseif(NOT EXISTS ${LLVM_EXTERNAL_LIT})
       message(WARNING "Could not find LLVM source directory and LLVM_EXTERNAL_LIT does not"
                        "point to a valid file.  You will not be able to run tests.")
     endif()
   endif()
+  add_subdirectory(unittests)
+  add_subdirectory(test)
 endif()
 
 add_subdirectory(tools)
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 28ded8766f2533..83f7697a4a2b42 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -916,7 +916,7 @@ cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COM
 if (COMPILER_RT_BUILD_CRT)
   add_compiler_rt_component(crt)
 
-  option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON)
+  option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" OFF)
 
   include(CheckSectionExists)
   check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY
diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c
index 7c5c26f4d113b1..1c451d7ec75637 100644
--- a/compiler-rt/lib/profile/InstrProfilingBuffer.c
+++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c
@@ -61,19 +61,12 @@ uint64_t __llvm_profile_get_size_for_buffer(void) {
       NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd);
 }
 
+// NOTE: Caller should guarantee that `Begin` and `End` specifies a half-open
+// interval [Begin, End). Namely, `End` is one-byte past the end of the array.
 COMPILER_RT_VISIBILITY
 uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin,
                                      const __llvm_profile_data *End) {
   intptr_t BeginI = (intptr_t)Begin, EndI = (intptr_t)End;
-  // `sizeof(__llvm_profile_data) - 1` is required in the numerator when
-  // [Begin, End] represents an inclusive range.
-  // For ELF, [Begin, End) represents the address of linker-inserted
-  // symbols  `__start__<elf-section>` and `__stop_<elf-section>`.
-  // Thereby, `End` is one byte past the inclusive range, and
-  // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get
-  // the correct number of profile data.
-  // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true
-  // across platforms.
   return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) /
          sizeof(__llvm_profile_data);
 }
diff --git a/compiler-rt/test/builtins/Unit/ctor_dtor.c b/compiler-rt/test/builtins/Unit/ctor_dtor.c
index 47560722a9f750..3d5f895a0a1cd1 100644
--- a/compiler-rt/test/builtins/Unit/ctor_dtor.c
+++ b/compiler-rt/test/builtins/Unit/ctor_dtor.c
@@ -9,23 +9,13 @@
 
 // Ensure the various startup functions are called in the proper order.
 
-// CHECK: __register_frame_info()
 /// ctor() is here if ld.so/libc supports DT_INIT/DT_FINI
 // CHECK:      main()
 /// dtor() is here if ld.so/libc supports DT_INIT/DT_FINI
-// CHECK:      __deregister_frame_info()
 
 struct object;
 static int counter;
 
-void __register_frame_info(const void *fi, struct object *obj) {
-  printf("__register_frame_info()\n");
-}
-
-void __deregister_frame_info(const void *fi) {
-  printf("__deregister_frame_info()\n");
-}
-
 void __attribute__((constructor)) ctor() {
   printf("ctor()\n");
   ++counter;
diff --git a/compiler-rt/test/profile/instrprof-basic.c b/compiler-rt/test/profile/instrprof-basic.c
index de66e1b2746806..702f521ba4ed8a 100644
--- a/compiler-rt/test/profile/instrprof-basic.c
+++ b/compiler-rt/test/profile/instrprof-basic.c
@@ -1,6 +1,7 @@
 // RUN: %clang_profgen -o %t -O3 %s
 // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
 // RUN: llvm-profdata merge -o %t.profdata %t.profraw
+// RUN: llvm-profdata show --all-functions %t.profdata | FileCheck %s --check-prefix=PROFCNT
 // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=ORIG
 //
 // RUN: rm -fr %t.dir1
@@ -8,6 +9,7 @@
 // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t
 // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t
 // RUN: llvm-profdata merge -o %t.em.profdata %t.dir1
+// RUN: llvm-profdata show --all-functions %t.em.profdata | FileCheck %s --check-prefix=PROFCNT
 // RUN: %clang_profuse=%t.em.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE
 //
 // RUN: rm -fr %t.dir2
@@ -16,6 +18,7 @@
 // RUN: %run %t.merge
 // RUN: %run %t.merge
 // RUN: llvm-profdata merge -o %t.m.profdata %t.dir2/
+// RUN: llvm-profdata show --all-functions %t.m.profdata | FileCheck %s --check-prefix=PROFCNT
 // RUN: %clang_profuse=%t.m.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE
 //
 // Test that merging is enabled by default with -fprofile-generate=
@@ -27,6 +30,7 @@
 // RUN: %run %t.merge3
 // RUN: %run %t.merge3
 // RUN: llvm-profdata merge -o %t.m3.profdata %t.dir3/
+// RUN: llvm-profdata show --all-functions %t.m3.profdata | FileCheck %s --check-prefix=PROFCNT
 // RUN: %clang_profuse=%t.m3.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE
 //
 // Test that merging is enabled by default with -fprofile-generate
@@ -40,6 +44,7 @@
 // RUN: %run %t.dir4/merge4
 // RUN: rm -f %t.dir4/merge4*
 // RUN: llvm-profdata merge -o %t.m4.profdata ./
+// RUN: llvm-profdata show --all-functions %t.m4.profdata | FileCheck %s --check-prefix=PROFCNT
 // RUN: %clang_profuse=%t.m4.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON  --check-prefix=PGOMERGE
 
 /// Test that the merge pool size can be larger than 10.
@@ -49,6 +54,13 @@
 // RUN: not ls %t.dir5/e_%20m.profraw
 // RUN: ls %t.dir5/e_*.profraw | count 1
 
+// Test that all three functions have counters in the profile.
+// PROFCNT-DAG: begin
+// PROFCNT-DAG: end
+// PROFCNT-DAG: main
+// PROFCNT: Functions shown: 3
+// PROFCNT: Total functions: 3
+
 int begin(int i) {
   // COMMON: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !prof ![[PD1:[0-9]+]]
   if (i)
diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h
index 3443aa06437b04..2e76bc0a162e61 100644
--- a/flang/include/flang/Common/float128.h
+++ b/flang/include/flang/Common/float128.h
@@ -20,6 +20,8 @@
 #ifndef FORTRAN_COMMON_FLOAT128_H_
 #define FORTRAN_COMMON_FLOAT128_H_
 
+#include <float.h>
+
 #ifdef __cplusplus
 /*
  * libc++ does not fully support __float128 right now, e.g.
@@ -49,4 +51,25 @@
 #endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \
           !defined(_LIBCPP_VERSION)  && !defined(__CUDA_ARCH__) */
 
+/* Define pure C CFloat128Type and CFloat128ComplexType. */
+#if LDBL_MANT_DIG == 113
+typedef long double CFloat128Type;
+#ifndef __cplusplus
+typedef long double _Complex CFloat128ComplexType;
+#endif
+#elif HAS_FLOAT128
+typedef __float128 CFloat128Type;
+
+#ifndef __cplusplus
+/*
+ * Use mode() attribute supported by GCC and Clang.
+ * Adjust it for other compilers as needed.
+ */
+#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__)
+typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType;
+#else
+typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType;
+#endif
+#endif // __cplusplus
+#endif
 #endif /* FORTRAN_COMMON_FLOAT128_H_ */
diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index e2af59e0aaa196..32e7a5e2b04061 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -53,6 +53,7 @@ class DerivedTypeSpec;
 
 namespace lower {
 class SymMap;
+struct SymbolBox;
 namespace pft {
 struct Variable;
 }
@@ -299,6 +300,11 @@ class AbstractConverter {
     return loweringOptions;
   }
 
+  /// Find the symbol in one level up of symbol map such as for host-association
+  /// in OpenMP code or return null.
+  virtual Fortran::lower::SymbolBox
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) = 0;
+
 private:
   /// Options controlling lowering behavior.
   const Fortran::lower::LoweringOptions &loweringOptions;
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 39821f1036c63f..bd9b67b14b966c 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -688,6 +688,9 @@ fir::BoxValue createBoxValue(fir::FirOpBuilder &builder, mlir::Location loc,
 /// Generate Null BoxProc for procedure pointer null initialization.
 mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc,
                               mlir::Type boxType);
+
+/// Set internal linkage attribute on a function.
+void setInternalLinkage(mlir::func::FuncOp);
 } // namespace fir::factory
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H
diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h
index b91fec0cd26b51..5b607765857523 100644
--- a/flang/include/flang/Runtime/reduction.h
+++ b/flang/include/flang/Runtime/reduction.h
@@ -92,9 +92,11 @@ void RTDECL(CppSumComplex8)(std::complex<double> &, const Descriptor &,
 void RTDECL(CppSumComplex10)(std::complex<long double> &, const Descriptor &,
     const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
-void RTDECL(CppSumComplex16)(std::complex<long double> &, const Descriptor &,
-    const char *source, int line, int dim = 0,
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDECL(CppSumComplex16)(std::complex<CppFloat128Type> &,
+    const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
+#endif
 
 void RTDECL(SumDim)(Descriptor &result, const Descriptor &array, int dim,
     const char *source, int line, const Descriptor *mask = nullptr);
@@ -145,12 +147,16 @@ void RTDECL(CppProductComplex4)(std::complex<float> &, const Descriptor &,
 void RTDECL(CppProductComplex8)(std::complex<double> &, const Descriptor &,
     const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
+#if LDBL_MANT_DIG == 64
 void RTDECL(CppProductComplex10)(std::complex<long double> &,
     const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
-void RTDECL(CppProductComplex16)(std::complex<long double> &,
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDECL(CppProductComplex16)(std::complex<CppFloat128Type> &,
     const Descriptor &, const char *source, int line, int dim = 0,
     const Descriptor *mask = nullptr);
+#endif
 
 void RTDECL(ProductDim)(Descriptor &result, const Descriptor &array, int dim,
     const char *source, int line, const Descriptor *mask = nullptr);
@@ -358,9 +364,12 @@ double RTDECL(Norm2_8)(
 #if LDBL_MANT_DIG == 64
 long double RTDECL(Norm2_10)(
     const Descriptor &, const char *source, int line, int dim = 0);
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 long double RTDECL(Norm2_16)(
     const Descriptor &, const char *source, int line, int dim = 0);
+void RTDECL(Norm2DimReal16)(
+    Descriptor &, const Descriptor &, int dim, const char *source, int line);
 #endif
 void RTDECL(Norm2Dim)(
     Descriptor &, const Descriptor &, int dim, const char *source, int line);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index f865b53f74debd..153ce0623ab3ae 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -1000,6 +1000,17 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       if (sym.detailsIf<Fortran::semantics::CommonBlockDetails>())
         return symMap->lookupSymbol(sym);
 
+      // For symbols to be privatized in OMP, the symbol is mapped to an
+      // instance of `SymbolBox::Intrinsic` (i.e. a direct mapping to an MLIR
+      // SSA value). This MLIR SSA value is the block argument to the
+      // `omp.private`'s `alloc` block. If this is the case, we return this
+      // `SymbolBox::Intrinsic` value.
+      if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
+        return v.match(
+            [&](const Fortran::lower::SymbolBox::Intrinsic &)
+                -> Fortran::lower::SymbolBox { return v; },
+            [](const auto &) -> Fortran::lower::SymbolBox { return {}; });
+
       return {};
     }
     if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym))
@@ -1018,7 +1029,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   /// Find the symbol in one level up of symbol map such as for host-association
   /// in OpenMP code or return null.
   Fortran::lower::SymbolBox
-  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) {
+  lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) override {
     if (Fortran::lower::SymbolBox v = localSymbols.lookupOneLevelUpSymbol(sym))
       return v;
     return {};
@@ -4477,7 +4488,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     assert(builder && "FirOpBuilder did not instantiate");
     builder->setFastMathFlags(bridge.getLoweringOptions().getMathOptions());
     builder->setInsertionPointToStart(&func.front());
-    func.setVisibility(mlir::SymbolTable::Visibility::Public);
+    if (funit.parent.isA<Fortran::lower::pft::FunctionLikeUnit>()) {
+      // Give internal linkage to internal functions. There are no name clash
+      // risks, but giving global linkage to internal procedure will break the
+      // static link register in shared libraries because of the system calls.
+      // Also, it should be possible to eliminate the procedure code if all the
+      // uses have been inlined.
+      fir::factory::setInternalLinkage(func);
+    } else {
+      func.setVisibility(mlir::SymbolTable::Visibility::Public);
+    }
     assert(blockId == 0 && "invalid blockId");
     assert(activeConstructStack.empty() && "invalid construct stack state");
 
diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp
index a62f7a7e99b6ff..b9e13ccad1c929 100644
--- a/flang/lib/Lower/HostAssociations.cpp
+++ b/flang/lib/Lower/HostAssociations.cpp
@@ -247,9 +247,11 @@ class CapturedCharacterScalars
   }
 };
 
-/// Class defining how polymorphic entities are captured in internal procedures.
-/// Polymorphic entities are always boxed as a fir.class box.
-class CapturedPolymorphic : public CapturedSymbols<CapturedPolymorphic> {
+/// Class defining how polymorphic scalar entities are captured in internal
+/// procedures. Polymorphic entities are always boxed as a fir.class box.
+/// Polymorphic array can be handled in CapturedArrays directly
+class CapturedPolymorphicScalar
+    : public CapturedSymbols<CapturedPolymorphicScalar> {
 public:
   static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
                             const Fortran::semantics::Symbol &sym) {
@@ -257,19 +259,50 @@ class CapturedPolymorphic : public CapturedSymbols<CapturedPolymorphic> {
   }
   static void instantiateHostTuple(const InstantiateHostTuple &args,
                                    Fortran::lower::AbstractConverter &converter,
-                                   const Fortran::semantics::Symbol &) {
+                                   const Fortran::semantics::Symbol &sym) {
     fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+    mlir::Location loc = args.loc;
     mlir::Type typeInTuple = fir::dyn_cast_ptrEleTy(args.addrInTuple.getType());
     assert(typeInTuple && "addrInTuple must be an address");
     mlir::Value castBox = builder.createConvert(args.loc, typeInTuple,
                                                 fir::getBase(args.hostValue));
-    builder.create<fir::StoreOp>(args.loc, castBox, args.addrInTuple);
+    if (Fortran::semantics::IsOptional(sym)) {
+      auto isPresent =
+          builder.create<fir::IsPresentOp>(loc, builder.getI1Type(), castBox);
+      builder.genIfThenElse(loc, isPresent)
+          .genThen([&]() {
+            builder.create<fir::StoreOp>(loc, castBox, args.addrInTuple);
+          })
+          .genElse([&]() {
+            mlir::Value null = fir::factory::createUnallocatedBox(
+                builder, loc, typeInTuple,
+                /*nonDeferredParams=*/mlir::ValueRange{});
+            builder.create<fir::StoreOp>(loc, null, args.addrInTuple);
+          })
+          .end();
+    } else {
+      builder.create<fir::StoreOp>(loc, castBox, args.addrInTuple);
+    }
   }
   static void getFromTuple(const GetFromTuple &args,
                            Fortran::lower::AbstractConverter &converter,
                            const Fortran::semantics::Symbol &sym,
                            const Fortran::lower::BoxAnalyzer &ba) {
-    bindCapturedSymbol(sym, args.valueInTuple, converter, args.symMap);
+    fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+    mlir::Location loc = args.loc;
+    mlir::Value box = args.valueInTuple;
+    if (Fortran::semantics::IsOptional(sym)) {
+      auto boxTy = box.getType().cast<fir::BaseBoxType>();
+      auto eleTy = boxTy.getEleTy();
+      if (!fir::isa_ref_type(eleTy))
+        eleTy = builder.getRefType(eleTy);
+      auto addr = builder.create<fir::BoxAddrOp>(loc, eleTy, box);
+      mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr);
+      auto absentBox = builder.create<fir::AbsentOp>(loc, boxTy);
+      box =
+          builder.create<mlir::arith::SelectOp>(loc, isPresent, box, absentBox);
+    }
+    bindCapturedSymbol(sym, box, converter, args.symMap);
   }
 };
 
@@ -342,7 +375,12 @@ class CapturedArrays : public CapturedSymbols<CapturedArrays> {
   static mlir::Type getType(Fortran::lower::AbstractConverter &converter,
                             const Fortran::semantics::Symbol &sym) {
     mlir::Type type = converter.genType(sym);
-    assert(type.isa<fir::SequenceType>() && "must be a sequence type");
+    bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym);
+    assert((type.isa<fir::SequenceType>() ||
+            (isPolymorphic && type.isa<fir::ClassType>())) &&
+           "must be a sequence type");
+    if (isPolymorphic)
+      return type;
     return fir::BoxType::get(type);
   }
 
@@ -410,13 +448,13 @@ class CapturedArrays : public CapturedSymbols<CapturedArrays> {
                          fir::factory::readBoxValue(builder, loc, boxValue),
                          converter, args.symMap);
     } else {
-      // Keep variable as a fir.box.
+      // Keep variable as a fir.box/fir.class.
       // If this is an optional that is absent, the fir.box needs to be an
       // AbsentOp result, otherwise it will not work properly with IsPresentOp
       // (absent boxes are null descriptor addresses, not descriptors containing
       // a null base address).
       if (Fortran::semantics::IsOptional(sym)) {
-        auto boxTy = box.getType().cast<fir::BoxType>();
+        auto boxTy = box.getType().cast<fir::BaseBoxType>();
         auto eleTy = boxTy.getEleTy();
         if (!fir::isa_ref_type(eleTy))
           eleTy = builder.getRefType(eleTy);
@@ -470,14 +508,10 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter,
   ba.analyze(sym);
   if (Fortran::semantics::IsAllocatableOrPointer(sym))
     return CapturedAllocatableAndPointer::visit(visitor, converter, sym, ba);
-  if (Fortran::semantics::IsPolymorphic(sym)) {
-    if (ba.isArray() && !ba.lboundIsAllOnes())
-      TODO(converter.genLocation(sym.name()),
-           "polymorphic array with non default lower bound");
-    return CapturedPolymorphic::visit(visitor, converter, sym, ba);
-  }
   if (ba.isArray())
     return CapturedArrays::visit(visitor, converter, sym, ba);
+  if (Fortran::semantics::IsPolymorphic(sym))
+    return CapturedPolymorphicScalar::visit(visitor, converter, sym, ba);
   if (ba.isChar())
     return CapturedCharacterScalars::visit(visitor, converter, sym, ba);
   assert(ba.isTrivial() && "must be trivial scalar");
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index 136bda0b582ee3..717b8cc0276a30 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -14,6 +14,7 @@
 
 #include "Utils.h"
 #include "flang/Lower/PFTBuilder.h"
+#include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Semantics/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
@@ -66,9 +67,10 @@ void DataSharingProcessor::cloneSymbol(const Fortran::semantics::Symbol *sym) {
 }
 
 void DataSharingProcessor::copyFirstPrivateSymbol(
-    const Fortran::semantics::Symbol *sym) {
+    const Fortran::semantics::Symbol *sym,
+    mlir::OpBuilder::InsertPoint *copyAssignIP) {
   if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate))
-    converter.copyHostAssociateVar(*sym);
+    converter.copyHostAssociateVar(*sym, copyAssignIP);
 }
 
 void DataSharingProcessor::copyLastPrivateSymbol(
@@ -307,14 +309,10 @@ void DataSharingProcessor::privatize() {
   for (const Fortran::semantics::Symbol *sym : privatizedSymbols) {
     if (const auto *commonDet =
             sym->detailsIf<Fortran::semantics::CommonBlockDetails>()) {
-      for (const auto &mem : commonDet->objects()) {
-        cloneSymbol(&*mem);
-        copyFirstPrivateSymbol(&*mem);
-      }
-    } else {
-      cloneSymbol(sym);
-      copyFirstPrivateSymbol(sym);
-    }
+      for (const auto &mem : commonDet->objects())
+        doPrivatize(&*mem);
+    } else
+      doPrivatize(sym);
   }
 }
 
@@ -338,11 +336,95 @@ void DataSharingProcessor::defaultPrivatize() {
         !sym->GetUltimate().has<Fortran::semantics::NamelistDetails>() &&
         !symbolsInNestedRegions.contains(sym) &&
         !symbolsInParentRegions.contains(sym) &&
-        !privatizedSymbols.contains(sym)) {
+        !privatizedSymbols.contains(sym))
+      doPrivatize(sym);
+  }
+}
+
+void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) {
+  if (!useDelayedPrivatization) {
+    cloneSymbol(sym);
+    copyFirstPrivateSymbol(sym);
+    return;
+  }
+
+  Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym);
+  assert(hsb && "Host symbol box not found");
+
+  mlir::Type symType = hsb.getAddr().getType();
+  mlir::Location symLoc = hsb.getAddr().getLoc();
+  std::string privatizerName = sym->name().ToString() + ".privatizer";
+  bool isFirstPrivate =
+      sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate);
+
+  mlir::omp::PrivateClauseOp privatizerOp = [&]() {
+    auto moduleOp = firOpBuilder.getModule();
+    auto uniquePrivatizerName = fir::getTypeAsString(
+        symType, converter.getKindMap(),
+        converter.mangleName(*sym) +
+            (isFirstPrivate ? "_firstprivate" : "_private"));
+
+    if (auto existingPrivatizer =
+            moduleOp.lookupSymbol<mlir::omp::PrivateClauseOp>(
+                uniquePrivatizerName))
+      return existingPrivatizer;
+
+    auto ip = firOpBuilder.saveInsertionPoint();
+    firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(),
+                                   moduleOp.getBodyRegion().front().begin());
+    auto result = firOpBuilder.create<mlir::omp::PrivateClauseOp>(
+        symLoc, uniquePrivatizerName, symType,
+        isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate
+                       : mlir::omp::DataSharingClauseType::Private);
+
+    symTable->pushScope();
+
+    // Populate the `alloc` region.
+    {
+      mlir::Region &allocRegion = result.getAllocRegion();
+      mlir::Block *allocEntryBlock = firOpBuilder.createBlock(
+          &allocRegion, /*insertPt=*/{}, symType, symLoc);
+
+      firOpBuilder.setInsertionPointToEnd(allocEntryBlock);
+      symTable->addSymbol(*sym, allocRegion.getArgument(0));
+      symTable->pushScope();
       cloneSymbol(sym);
-      copyFirstPrivateSymbol(sym);
+      firOpBuilder.create<mlir::omp::YieldOp>(
+          hsb.getAddr().getLoc(),
+          symTable->shallowLookupSymbol(*sym).getAddr());
+      symTable->popScope();
     }
-  }
+
+    // Populate the `copy` region if this is a `firstprivate`.
+    if (isFirstPrivate) {
+      mlir::Region &copyRegion = result.getCopyRegion();
+      // First block argument corresponding to the original/host value while
+      // second block argument corresponding to the privatized value.
+      mlir::Block *copyEntryBlock = firOpBuilder.createBlock(
+          &copyRegion, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc});
+      firOpBuilder.setInsertionPointToEnd(copyEntryBlock);
+      symTable->addSymbol(*sym, copyRegion.getArgument(0),
+                          /*force=*/true);
+      symTable->pushScope();
+      symTable->addSymbol(*sym, copyRegion.getArgument(1));
+      auto ip = firOpBuilder.saveInsertionPoint();
+      copyFirstPrivateSymbol(sym, &ip);
+
+      firOpBuilder.create<mlir::omp::YieldOp>(
+          hsb.getAddr().getLoc(),
+          symTable->shallowLookupSymbol(*sym).getAddr());
+      symTable->popScope();
+    }
+
+    symTable->popScope();
+    firOpBuilder.restoreInsertionPoint(ip);
+    return result;
+  }();
+
+  delayedPrivatizationInfo.privatizers.push_back(
+      mlir::SymbolRefAttr::get(privatizerOp));
+  delayedPrivatizationInfo.originalAddresses.push_back(hsb.getAddr());
+  delayedPrivatizationInfo.symbols.push_back(sym);
 }
 
 } // namespace omp
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
index 10c0a30c09c391..9f7301df07598f 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h
@@ -23,6 +23,24 @@ namespace lower {
 namespace omp {
 
 class DataSharingProcessor {
+public:
+  /// Collects all the information needed for delayed privatization. This can be
+  /// used by ops with data-sharing clauses to properly generate their regions
+  /// (e.g. add region arguments) and map the original SSA values to their
+  /// corresponding OMP region operands.
+  struct DelayedPrivatizationInfo {
+    // The list of symbols referring to delayed privatizer ops (i.e.
+    // `omp.private` ops).
+    llvm::SmallVector<mlir::SymbolRefAttr> privatizers;
+    // SSA values that correspond to "original" values being privatized.
+    // "Original" here means the SSA value outside the OpenMP region from which
+    // a clone is created inside the region.
+    llvm::SmallVector<mlir::Value> originalAddresses;
+    // Fortran symbols corresponding to the above SSA values.
+    llvm::SmallVector<const Fortran::semantics::Symbol *> symbols;
+  };
+
+private:
   bool hasLastPrivateOp;
   mlir::OpBuilder::InsertPoint lastPrivIP;
   mlir::OpBuilder::InsertPoint insPt;
@@ -36,6 +54,9 @@ class DataSharingProcessor {
   fir::FirOpBuilder &firOpBuilder;
   const Fortran::parser::OmpClauseList &opClauseList;
   Fortran::lower::pft::Evaluation &eval;
+  bool useDelayedPrivatization;
+  Fortran::lower::SymMap *symTable;
+  DelayedPrivatizationInfo delayedPrivatizationInfo;
 
   bool needBarrier();
   void collectSymbols(Fortran::semantics::Symbol::Flag flag);
@@ -47,10 +68,13 @@ class DataSharingProcessor {
   void collectDefaultSymbols();
   void privatize();
   void defaultPrivatize();
+  void doPrivatize(const Fortran::semantics::Symbol *sym);
   void copyLastPrivatize(mlir::Operation *op);
   void insertLastPrivateCompare(mlir::Operation *op);
   void cloneSymbol(const Fortran::semantics::Symbol *sym);
-  void copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym);
+  void
+  copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym,
+                         mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr);
   void copyLastPrivateSymbol(const Fortran::semantics::Symbol *sym,
                              mlir::OpBuilder::InsertPoint *lastPrivIP);
   void insertDeallocs();
@@ -58,10 +82,14 @@ class DataSharingProcessor {
 public:
   DataSharingProcessor(Fortran::lower::AbstractConverter &converter,
                        const Fortran::parser::OmpClauseList &opClauseList,
-                       Fortran::lower::pft::Evaluation &eval)
+                       Fortran::lower::pft::Evaluation &eval,
+                       bool useDelayedPrivatization = false,
+                       Fortran::lower::SymMap *symTable = nullptr)
       : hasLastPrivateOp(false), converter(converter),
         firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList),
-        eval(eval) {}
+        eval(eval), useDelayedPrivatization(useDelayedPrivatization),
+        symTable(symTable) {}
+
   // Privatisation is split into two steps.
   // Step1 performs cloning of all privatisation clauses and copying for
   // firstprivates. Step1 is performed at the place where process/processStep1
@@ -80,6 +108,10 @@ class DataSharingProcessor {
     assert(!loopIV && "Loop iteration variable already set");
     loopIV = iv;
   }
+
+  const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const {
+    return delayedPrivatizationInfo;
+  }
 };
 
 } // namespace omp
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 7953bf83cba0fe..21ad51c53d8742 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -558,6 +558,7 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
 
 static mlir::omp::ParallelOp
 genParallelOp(Fortran::lower::AbstractConverter &converter,
+              Fortran::lower::SymMap &symTable,
               Fortran::semantics::SemanticsContext &semaCtx,
               Fortran::lower::pft::Evaluation &eval, bool genNested,
               mlir::Location currentLocation,
@@ -590,8 +591,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
   auto reductionCallback = [&](mlir::Operation *op) {
     llvm::SmallVector<mlir::Location> locs(reductionVars.size(),
                                            currentLocation);
-    auto block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {},
-                                                         reductionTypes, locs);
+    auto *block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {},
+                                                          reductionTypes, locs);
     for (auto [arg, prv] :
          llvm::zip_equal(reductionSymbols, block->getArguments())) {
       converter.bindSymbol(*arg, prv);
@@ -599,13 +600,78 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
     return reductionSymbols;
   };
 
-  return genOpWithBody<mlir::omp::ParallelOp>(
+  OpWithBodyGenInfo genInfo =
       OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
           .setGenNested(genNested)
           .setOuterCombined(outerCombined)
           .setClauses(&clauseList)
           .setReductions(&reductionSymbols, &reductionTypes)
-          .setGenRegionEntryCb(reductionCallback),
+          .setGenRegionEntryCb(reductionCallback);
+
+  if (!enableDelayedPrivatization) {
+    return genOpWithBody<mlir::omp::ParallelOp>(
+        genInfo,
+        /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
+        numThreadsClauseOperand, allocateOperands, allocatorOperands,
+        reductionVars,
+        reductionDeclSymbols.empty()
+            ? nullptr
+            : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                   reductionDeclSymbols),
+        procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
+        /*privatizers=*/nullptr);
+  }
+
+  bool privatize = !outerCombined;
+  DataSharingProcessor dsp(converter, clauseList, eval,
+                           /*useDelayedPrivatization=*/true, &symTable);
+
+  if (privatize)
+    dsp.processStep1();
+
+  const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo();
+
+  auto genRegionEntryCB = [&](mlir::Operation *op) {
+    auto parallelOp = llvm::cast<mlir::omp::ParallelOp>(op);
+
+    llvm::SmallVector<mlir::Location> reductionLocs(reductionVars.size(),
+                                                    currentLocation);
+
+    mlir::OperandRange privateVars = parallelOp.getPrivateVars();
+    mlir::Region &region = parallelOp.getRegion();
+
+    llvm::SmallVector<mlir::Type> privateVarTypes = reductionTypes;
+    privateVarTypes.reserve(privateVarTypes.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(privateVarTypes),
+                    [](mlir::Value v) { return v.getType(); });
+
+    llvm::SmallVector<mlir::Location> privateVarLocs = reductionLocs;
+    privateVarLocs.reserve(privateVarLocs.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(privateVarLocs),
+                    [](mlir::Value v) { return v.getLoc(); });
+
+    converter.getFirOpBuilder().createBlock(&region, /*insertPt=*/{},
+                                            privateVarTypes, privateVarLocs);
+
+    llvm::SmallVector<const Fortran::semantics::Symbol *> allSymbols =
+        reductionSymbols;
+    allSymbols.append(delayedPrivatizationInfo.symbols);
+    for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) {
+      converter.bindSymbol(*arg, prv);
+    }
+
+    return allSymbols;
+  };
+
+  // TODO Merge with the reduction CB.
+  genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp);
+
+  llvm::SmallVector<mlir::Attribute> privatizers(
+      delayedPrivatizationInfo.privatizers.begin(),
+      delayedPrivatizationInfo.privatizers.end());
+
+  return genOpWithBody<mlir::omp::ParallelOp>(
+      genInfo,
       /*resultTypes=*/mlir::TypeRange(), ifClauseOperand,
       numThreadsClauseOperand, allocateOperands, allocatorOperands,
       reductionVars,
@@ -613,8 +679,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                  reductionDeclSymbols),
-      procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
-      /*privatizers=*/nullptr);
+      procBindKindAttr, delayedPrivatizationInfo.originalAddresses,
+      delayedPrivatizationInfo.privatizers.empty()
+          ? nullptr
+          : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                 privatizers));
 }
 
 static mlir::omp::SectionOp
@@ -1621,7 +1690,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
     if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
             .test(ompDirective)) {
       validDirective = true;
-      genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+      genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
                     currentLocation, loopOpClauseList,
                     /*outerCombined=*/true);
     }
@@ -1711,8 +1780,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                        currentLocation);
     break;
   case llvm::omp::Directive::OMPD_parallel:
-    genParallelOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
-                  beginClauseList);
+    genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
+                  currentLocation, beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
@@ -1769,7 +1838,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
           .test(directive.v)) {
     bool outerCombined =
         directive.v != llvm::omp::Directive::OMPD_target_parallel;
-    genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+    genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
                   currentLocation, beginClauseList, outerCombined);
     combinedDirective = true;
   }
@@ -1852,7 +1921,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
 
   // Parallel wrapper of PARALLEL SECTIONS construct
   if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
-    genParallelOp(converter, semaCtx, eval,
+    genParallelOp(converter, symTable, semaCtx, eval,
                   /*genNested=*/false, currentLocation, sectionsClauseList,
                   /*outerCombined=*/true);
   } else {
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 31b15257d18687..49517f62895dfe 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -24,6 +24,12 @@ llvm::cl::opt<bool> treatIndexAsSection(
     llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
     llvm::cl::init(true));
 
+llvm::cl::opt<bool> enableDelayedPrivatization(
+    "openmp-enable-delayed-privatization",
+    llvm::cl::desc(
+        "Emit `[first]private` variables as clauses on the MLIR ops."),
+    llvm::cl::init(false));
+
 namespace Fortran {
 namespace lower {
 namespace omp {
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index c346f891f0797e..f57cd7420ce4d5 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -15,6 +15,7 @@
 #include "llvm/Support/CommandLine.h"
 
 extern llvm::cl::opt<bool> treatIndexAsSection;
+extern llvm::cl::opt<bool> enableDelayedPrivatization;
 
 namespace fir {
 class FirOpBuilder;
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index 3cce39f5b8c782..788c99e40105a2 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -18,6 +18,7 @@
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/FatalError.h"
 #include "flang/Optimizer/Support/InternalNames.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -1533,3 +1534,10 @@ mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder,
   mlir::Value initVal{builder.create<fir::ZeroOp>(loc, boxEleTy)};
   return builder.create<fir::EmboxProcOp>(loc, boxTy, initVal);
 }
+
+void fir::factory::setInternalLinkage(mlir::func::FuncOp func) {
+  auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal;
+  auto linkage =
+      mlir::LLVM::LinkageAttr::get(func->getContext(), internalLinkage);
+  func->setAttr("llvm.linkage", linkage);
+}
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index c84fb27cb38da5..837ef0576ef696 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -1834,10 +1834,7 @@ mlir::func::FuncOp IntrinsicLibrary::getWrapper(GeneratorType generator,
     // First time this wrapper is needed, build it.
     function = builder.createFunction(loc, wrapperName, funcType);
     function->setAttr("fir.intrinsic", builder.getUnitAttr());
-    auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal;
-    auto linkage =
-        mlir::LLVM::LinkageAttr::get(builder.getContext(), internalLinkage);
-    function->setAttr("llvm.linkage", linkage);
+    fir::factory::setInternalLinkage(function);
     function.addEntryBlock();
 
     // Create local context to emit code into the newly created function
diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp
index 4d8860b60915c4..d4012e9c3d9d93 100644
--- a/flang/lib/Optimizer/Builder/MutableBox.cpp
+++ b/flang/lib/Optimizer/Builder/MutableBox.cpp
@@ -674,7 +674,7 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder,
     // 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the
     // same as its declared type.
     auto boxTy = box.getBoxTy().dyn_cast<fir::BaseBoxType>();
-    auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy.getEleTy());
+    auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy());
     mlir::Type derivedType = fir::getDerivedType(eleTy);
     if (auto recTy = derivedType.dyn_cast<fir::RecordType>()) {
       fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy,
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index fabbff818b6f0e..66fbaddcbda1aa 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -149,6 +149,22 @@ struct ForcedNorm2Real16 {
   }
 };
 
+/// Placeholder for real*16 version of Norm2Dim Intrinsic
+struct ForcedNorm2DimReal16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Norm2DimReal16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      return mlir::FunctionType::get(
+          ctx, {fir::ReferenceType::get(boxTy), boxTy, intTy, strTy, intTy},
+          mlir::NoneType::get(ctx));
+    };
+  }
+};
+
 /// Placeholder for real*10 version of Product Intrinsic
 struct ForcedProductReal10 {
   static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ProductReal10));
@@ -876,7 +892,14 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder,
 void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc,
                                mlir::Value resultBox, mlir::Value arrayBox,
                                mlir::Value dim) {
-  auto func = fir::runtime::getRuntimeFunc<mkRTKey(Norm2Dim)>(loc, builder);
+  mlir::func::FuncOp func;
+  auto ty = arrayBox.getType();
+  auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
+  auto eleTy = arrTy.cast<fir::SequenceType>().getEleTy();
+  if (eleTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedNorm2DimReal16>(loc, builder);
+  else
+    func = fir::runtime::getRuntimeFunc<mkRTKey(Norm2Dim)>(loc, builder);
   auto fTy = func.getFunctionType();
   auto sourceFile = fir::factory::locationToFilename(builder, loc);
   auto sourceLine =
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index 314e4264c17e83..377cc44392028f 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -176,7 +176,14 @@ class HlfirIntrinsicConversion : public mlir::OpRewritePattern<OP> {
           rewriter.eraseOp(use);
       }
     }
-    rewriter.replaceAllUsesWith(op->getResults(), {base});
+    // TODO: This entire pass should be a greedy pattern rewrite or a manual
+    // IR traversal. A dialect conversion cannot be used here because
+    // `replaceAllUsesWith` is not supported. Similarly, `replaceOp` is not
+    // suitable because "op->getResult(0)" and "base" can have different types.
+    // In such a case, the dialect conversion will attempt to convert the type,
+    // but no type converter is specified in this pass. Also note that all
+    // patterns in this pass are actually rewrite patterns.
+    op->getResult(0).replaceAllUsesWith(base);
     rewriter.replaceOp(op, base);
   }
 };
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index 8d276e8f122728..f11678cd70b769 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -69,6 +69,7 @@ set(sources
   log.cpp
   log10.cpp
   lround.cpp
+  norm2.cpp
   pow.cpp
   round.cpp
   sin.cpp
diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp
index 63f2bdf8e177ae..827b197a6a81ae 100644
--- a/flang/runtime/Float128Math/cabs.cpp
+++ b/flang/runtime/Float128Math/cabs.cpp
@@ -10,15 +10,15 @@
 
 namespace Fortran::runtime {
 extern "C" {
-
+#if 0
+// FIXME: temporarily disabled. Need to add pure C entry point
+// using C _Complex ABI.
 #if LDBL_MANT_DIG == 113 || HAS_FLOAT128
-// FIXME: the argument should be CppTypeFor<TypeCategory::Complex, 16>,
-// and it should be translated into the underlying library's
-// corresponding complex128 type.
-CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(ComplexF128 x) {
+// NOTE: Flang calls the runtime APIs using C _Complex ABI
+CppTypeFor<TypeCategory::Real, 16> RTDEF(CAbsF128)(CFloat128ComplexType x) {
   return CAbs<RTNAME(CAbsF128)>::invoke(x);
 }
 #endif
-
+#endif
 } // extern "C"
 } // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index fe1525468edcaf..a0d81d0cbb5407 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -54,6 +54,7 @@ namespace Fortran::runtime {
   };
 
 // Define fallback callers.
+DEFINE_FALLBACK(Abs)
 DEFINE_FALLBACK(Acos)
 DEFINE_FALLBACK(Acosh)
 DEFINE_FALLBACK(Asin)
@@ -61,7 +62,6 @@ DEFINE_FALLBACK(Asinh)
 DEFINE_FALLBACK(Atan)
 DEFINE_FALLBACK(Atan2)
 DEFINE_FALLBACK(Atanh)
-DEFINE_FALLBACK(CAbs)
 DEFINE_FALLBACK(Ceil)
 DEFINE_FALLBACK(Cos)
 DEFINE_FALLBACK(Cosh)
@@ -91,15 +91,6 @@ DEFINE_FALLBACK(Y0)
 DEFINE_FALLBACK(Y1)
 DEFINE_FALLBACK(Yn)
 
-// Define ComplexF128 type that is compatible with
-// the type of results/arguments of libquadmath.
-// TODO: this may need more work for other libraries/compilers.
-#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__)
-typedef _Complex float __attribute__((mode(TC))) ComplexF128;
-#else
-typedef _Complex float __attribute__((mode(KC))) ComplexF128;
-#endif
-
 #if HAS_LIBM
 // Define wrapper callers for libm.
 #include <ccomplex>
@@ -109,6 +100,7 @@ typedef _Complex float __attribute__((mode(KC))) ComplexF128;
 // Use STD math functions. They provide IEEE-754 128-bit float
 // support either via 'long double' or __float128.
 // The Bessel's functions are not present in STD namespace.
+DEFINE_SIMPLE_ALIAS(Abs, std::abs)
 DEFINE_SIMPLE_ALIAS(Acos, std::acos)
 DEFINE_SIMPLE_ALIAS(Acosh, std::acosh)
 DEFINE_SIMPLE_ALIAS(Asin, std::asin)
@@ -165,6 +157,7 @@ DEFINE_SIMPLE_ALIAS(Yn, ynl)
 #elif HAS_QUADMATHLIB
 // Define wrapper callers for libquadmath.
 #include "quadmath.h"
+DEFINE_SIMPLE_ALIAS(Abs, fabsq)
 DEFINE_SIMPLE_ALIAS(Acos, acosq)
 DEFINE_SIMPLE_ALIAS(Acosh, acoshq)
 DEFINE_SIMPLE_ALIAS(Asin, asinq)
@@ -172,7 +165,6 @@ DEFINE_SIMPLE_ALIAS(Asinh, asinhq)
 DEFINE_SIMPLE_ALIAS(Atan, atanq)
 DEFINE_SIMPLE_ALIAS(Atan2, atan2q)
 DEFINE_SIMPLE_ALIAS(Atanh, atanhq)
-DEFINE_SIMPLE_ALIAS(CAbs, cabsq)
 DEFINE_SIMPLE_ALIAS(Ceil, ceilq)
 DEFINE_SIMPLE_ALIAS(Cos, cosq)
 DEFINE_SIMPLE_ALIAS(Cosh, coshq)
@@ -202,6 +194,19 @@ DEFINE_SIMPLE_ALIAS(Y0, y0q)
 DEFINE_SIMPLE_ALIAS(Y1, y1q)
 DEFINE_SIMPLE_ALIAS(Yn, ynq)
 #endif
+
+extern "C" {
+// Declarations of the entry points that might be referenced
+// within the Float128Math library itself.
+// Note that not all of these entry points are actually
+// defined in this library. Some of them are used just
+// as template parameters to call the corresponding callee directly.
+CppTypeFor<TypeCategory::Real, 16> RTDECL(AbsF128)(
+    CppTypeFor<TypeCategory::Real, 16> x);
+CppTypeFor<TypeCategory::Real, 16> RTDECL(SqrtF128)(
+    CppTypeFor<TypeCategory::Real, 16> x);
+} // extern "C"
+
 } // namespace Fortran::runtime
 
 #endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_
diff --git a/flang/runtime/Float128Math/norm2.cpp b/flang/runtime/Float128Math/norm2.cpp
new file mode 100644
index 00000000000000..17453bd2d6cbd7
--- /dev/null
+++ b/flang/runtime/Float128Math/norm2.cpp
@@ -0,0 +1,59 @@
+//===-- runtime/Float128Math/norm2.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "reduction-templates.h"
+#include <cmath>
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+
+namespace {
+using namespace Fortran::runtime;
+
+using AccumType = Norm2AccumType<16>;
+
+struct ABSTy {
+  static AccumType compute(AccumType x) {
+    return Sqrt<RTNAME(AbsF128)>::invoke(x);
+  }
+};
+
+struct SQRTTy {
+  static AccumType compute(AccumType x) {
+    return Sqrt<RTNAME(SqrtF128)>::invoke(x);
+  }
+};
+
+using Float128Norm2Accumulator = Norm2Accumulator<16, ABSTy, SQRTTy>;
+} // namespace
+
+namespace Fortran::runtime {
+extern "C" {
+
+CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
+    const Descriptor &x, const char *source, int line, int dim) {
+  auto accumulator{::Float128Norm2Accumulator(x)};
+  return GetTotalReduction<TypeCategory::Real, 16>(
+      x, source, line, dim, nullptr, accumulator, "NORM2");
+}
+
+void RTDEF(Norm2DimReal16)(Descriptor &result, const Descriptor &x, int dim,
+    const char *source, int line) {
+  Terminator terminator{source, line};
+  auto type{x.type().GetCategoryAndKind()};
+  RUNTIME_CHECK(terminator, type);
+  RUNTIME_CHECK(
+      terminator, type->first == TypeCategory::Real && type->second == 16);
+  DoMaxMinNorm2<TypeCategory::Real, 16, ::Float128Norm2Accumulator>(
+      result, x, dim, nullptr, "NORM2", terminator);
+}
+
+} // extern "C"
+} // namespace Fortran::runtime
+
+#endif
diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c
index d77e1c0a550069..72c31ce08b875a 100644
--- a/flang/runtime/complex-reduction.c
+++ b/flang/runtime/complex-reduction.c
@@ -19,6 +19,11 @@ struct CppComplexDouble {
 struct CppComplexLongDouble {
   long double r, i;
 };
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+struct CppComplexFloat128 {
+  CFloat128Type r, i;
+};
+#endif
 
 /* Not all environments define CMPLXF, CMPLX, CMPLXL. */
 
@@ -70,6 +75,29 @@ static long_double_Complex_t CMPLXL(long double r, long double i) {
 #endif
 #endif
 
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+#ifndef CMPLXF128
+/*
+ * GCC 7.4.0 (currently minimum GCC version for llvm builds)
+ * supports __builtin_complex. For Clang, require >=12.0.
+ * Otherwise, rely on the memory layout compatibility.
+ */
+#if (defined(__clang_major__) && (__clang_major__ >= 12)) || defined(__GNUC__)
+#define CMPLXF128 __builtin_complex
+#else
+static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) {
+  union {
+    struct CppComplexFloat128 x;
+    CFloat128ComplexType result;
+  } u;
+  u.x.r = r;
+  u.x.i = i;
+  return u.result;
+}
+#endif
+#endif
+#endif
+
 /* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments
  * and converts the members of its C++ complex result to C _Complex.
  */
@@ -93,9 +121,10 @@ ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX,
 #if LDBL_MANT_DIG == 64
 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble,
     CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(SumComplex16, long_double_Complex_t, CppComplexLongDouble,
-    CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(SumComplex16, CFloat128ComplexType, CppComplexFloat128,
+    CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
 #endif
 
 /* PRODUCT() */
@@ -106,9 +135,10 @@ ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX,
 #if LDBL_MANT_DIG == 64
 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble,
     CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(ProductComplex16, long_double_Complex_t, CppComplexLongDouble,
-    CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(ProductComplex16, CFloat128ComplexType, CppComplexFloat128,
+    CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES)
 #endif
 
 /* DOT_PRODUCT() */
@@ -119,7 +149,8 @@ ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX,
 #if LDBL_MANT_DIG == 64
 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t,
     CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
-#elif LDBL_MANT_DIG == 113
-ADAPT_REDUCTION(DotProductComplex16, long_double_Complex_t,
-    CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+ADAPT_REDUCTION(DotProductComplex16, CFloat128ComplexType, CppComplexFloat128,
+    CMPLXF128, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES)
 #endif
diff --git a/flang/runtime/complex-reduction.h b/flang/runtime/complex-reduction.h
index 5c4f1f5126e393..1d37b235d5194b 100644
--- a/flang/runtime/complex-reduction.h
+++ b/flang/runtime/complex-reduction.h
@@ -15,6 +15,7 @@
 #ifndef FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
 #define FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
 
+#include "flang/Common/float128.h"
 #include "flang/Runtime/entry-names.h"
 #include <complex.h>
 
@@ -40,14 +41,18 @@ float_Complex_t RTNAME(SumComplex3)(REDUCTION_ARGS);
 float_Complex_t RTNAME(SumComplex4)(REDUCTION_ARGS);
 double_Complex_t RTNAME(SumComplex8)(REDUCTION_ARGS);
 long_double_Complex_t RTNAME(SumComplex10)(REDUCTION_ARGS);
-long_double_Complex_t RTNAME(SumComplex16)(REDUCTION_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(SumComplex16)(REDUCTION_ARGS);
+#endif
 
 float_Complex_t RTNAME(ProductComplex2)(REDUCTION_ARGS);
 float_Complex_t RTNAME(ProductComplex3)(REDUCTION_ARGS);
 float_Complex_t RTNAME(ProductComplex4)(REDUCTION_ARGS);
 double_Complex_t RTNAME(ProductComplex8)(REDUCTION_ARGS);
 long_double_Complex_t RTNAME(ProductComplex10)(REDUCTION_ARGS);
-long_double_Complex_t RTNAME(ProductComplex16)(REDUCTION_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(ProductComplex16)(REDUCTION_ARGS);
+#endif
 
 #define DOT_PRODUCT_ARGS \
   const struct CppDescriptor *x, const struct CppDescriptor *y, \
@@ -60,6 +65,8 @@ float_Complex_t RTNAME(DotProductComplex3)(DOT_PRODUCT_ARGS);
 float_Complex_t RTNAME(DotProductComplex4)(DOT_PRODUCT_ARGS);
 double_Complex_t RTNAME(DotProductComplex8)(DOT_PRODUCT_ARGS);
 long_double_Complex_t RTNAME(DotProductComplex10)(DOT_PRODUCT_ARGS);
-long_double_Complex_t RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS);
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CFloat128ComplexType RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS);
+#endif
 
 #endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_
diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp
index 3fdc8e159866d1..fc2b4e165cb269 100644
--- a/flang/runtime/extrema.cpp
+++ b/flang/runtime/extrema.cpp
@@ -528,35 +528,6 @@ inline RT_API_ATTRS CppTypeFor<CAT, KIND> TotalNumericMaxOrMin(
       NumericExtremumAccumulator<CAT, KIND, IS_MAXVAL>{x}, intrinsic);
 }
 
-template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
-static RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
-    int dim, const Descriptor *mask, const char *intrinsic,
-    Terminator &terminator) {
-  using Type = CppTypeFor<CAT, KIND>;
-  ACCUMULATOR accumulator{x};
-  if (dim == 0 || x.rank() == 1) {
-    // Total reduction
-
-    // Element size of the destination descriptor is the same
-    // as the element size of the source.
-    result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
-        CFI_attribute_allocatable);
-    if (int stat{result.Allocate()}) {
-      terminator.Crash(
-          "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
-    }
-    DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator);
-    accumulator.GetResult(result.OffsetElement<Type>());
-  } else {
-    // Partial reduction
-
-    // Element size of the destination descriptor is the same
-    // as the element size of the source.
-    PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim,
-        mask, terminator, intrinsic, accumulator);
-  }
-}
-
 template <TypeCategory CAT, bool IS_MAXVAL> struct MaxOrMinHelper {
   template <int KIND> struct Functor {
     RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
@@ -802,66 +773,11 @@ RT_EXT_API_GROUP_END
 
 // NORM2
 
-RT_VAR_GROUP_BEGIN
-
-// Use at least double precision for accumulators.
-// Don't use __float128, it doesn't work with abs() or sqrt() yet.
-static constexpr RT_CONST_VAR_ATTRS int largestLDKind {
-#if LDBL_MANT_DIG == 113
-  16
-#elif LDBL_MANT_DIG == 64
-  10
-#else
-  8
-#endif
-};
-
-RT_VAR_GROUP_END
-
-template <int KIND> class Norm2Accumulator {
-public:
-  using Type = CppTypeFor<TypeCategory::Real, KIND>;
-  using AccumType =
-      CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, largestLDKind)>;
-  explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
-      : array_{array} {}
-  RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
-  template <typename A>
-  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
-    // m * sqrt(1 + sum((others(:)/m)**2))
-    *p = static_cast<Type>(max_ * std::sqrt(1 + sum_));
-  }
-  RT_API_ATTRS bool Accumulate(Type x) {
-    auto absX{std::abs(static_cast<AccumType>(x))};
-    if (!max_) {
-      max_ = absX;
-    } else if (absX > max_) {
-      auto t{max_ / absX}; // < 1.0
-      auto tsq{t * t};
-      sum_ *= tsq; // scale sum to reflect change to the max
-      sum_ += tsq; // include a term for the previous max
-      max_ = absX;
-    } else { // absX <= max_
-      auto t{absX / max_};
-      sum_ += t * t;
-    }
-    return true;
-  }
-  template <typename A>
-  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
-    return Accumulate(*array_.Element<A>(at));
-  }
-
-private:
-  const Descriptor &array_;
-  AccumType max_{0}; // value (m) with largest magnitude
-  AccumType sum_{0}; // sum((others(:)/m)**2)
-};
-
 template <int KIND> struct Norm2Helper {
   RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
       const Descriptor *mask, Terminator &terminator) const {
-    DoMaxMinNorm2<TypeCategory::Real, KIND, Norm2Accumulator<KIND>>(
+    DoMaxMinNorm2<TypeCategory::Real, KIND,
+        typename Norm2AccumulatorGetter<KIND>::Type>(
         result, x, dim, mask, "NORM2", terminator);
   }
 };
@@ -872,26 +788,27 @@ RT_EXT_API_GROUP_BEGIN
 // TODO: REAL(2 & 3)
 CppTypeFor<TypeCategory::Real, 4> RTDEF(Norm2_4)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 4>(
-      x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 4>(x, source, line, dim, nullptr,
+      Norm2AccumulatorGetter<4>::create(x), "NORM2");
 }
 CppTypeFor<TypeCategory::Real, 8> RTDEF(Norm2_8)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 8>(
-      x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 8>(x, source, line, dim, nullptr,
+      Norm2AccumulatorGetter<8>::create(x), "NORM2");
 }
 #if LDBL_MANT_DIG == 64
 CppTypeFor<TypeCategory::Real, 10> RTDEF(Norm2_10)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 10>(
-      x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 10>(x, source, line, dim,
+      nullptr, Norm2AccumulatorGetter<10>::create(x), "NORM2");
 }
 #endif
 #if LDBL_MANT_DIG == 113
+// The __float128 implementation resides in FortranFloat128Math library.
 CppTypeFor<TypeCategory::Real, 16> RTDEF(Norm2_16)(
     const Descriptor &x, const char *source, int line, int dim) {
-  return GetTotalReduction<TypeCategory::Real, 16>(
-      x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2");
+  return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim,
+      nullptr, Norm2AccumulatorGetter<16>::create(x), "NORM2");
 }
 #endif
 
@@ -901,7 +818,7 @@ void RTDEF(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim,
   auto type{x.type().GetCategoryAndKind()};
   RUNTIME_CHECK(terminator, type);
   if (type->first == TypeCategory::Real) {
-    ApplyFloatingPointKind<Norm2Helper, void>(
+    ApplyFloatingPointKind<Norm2Helper, void, true>(
         type->second, terminator, result, x, dim, nullptr, terminator);
   } else {
     terminator.Crash("NORM2: bad type code %d", x.type().raw());
diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp
index a516bc51a959b7..4c3b8c33a12e0f 100644
--- a/flang/runtime/product.cpp
+++ b/flang/runtime/product.cpp
@@ -123,7 +123,8 @@ CppTypeFor<TypeCategory::Real, 10> RTDEF(ProductReal10)(const Descriptor &x,
       NonComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
       "PRODUCT");
 }
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 CppTypeFor<TypeCategory::Real, 16> RTDEF(ProductReal16)(const Descriptor &x,
     const char *source, int line, int dim, const Descriptor *mask) {
   return GetTotalReduction<TypeCategory::Real, 16>(x, source, line, dim, mask,
@@ -154,7 +155,8 @@ void RTDEF(CppProductComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
       mask, ComplexProductAccumulator<CppTypeFor<TypeCategory::Real, 10>>{x},
       "PRODUCT");
 }
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 void RTDEF(CppProductComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h
index 7d0f82d59a084d..0891bc021ff753 100644
--- a/flang/runtime/reduction-templates.h
+++ b/flang/runtime/reduction-templates.h
@@ -25,6 +25,7 @@
 #include "tools.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
+#include <algorithm>
 
 namespace Fortran::runtime {
 
@@ -332,5 +333,119 @@ template <typename ACCUMULATOR> struct PartialLocationHelper {
   };
 };
 
+// NORM2 templates
+
+RT_VAR_GROUP_BEGIN
+
+// Use at least double precision for accumulators.
+// Don't use __float128, it doesn't work with abs() or sqrt() yet.
+static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind {
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+  16
+#elif LDBL_MANT_DIG == 64
+  10
+#else
+  8
+#endif
+};
+
+RT_VAR_GROUP_END
+
+template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
+inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
+    int dim, const Descriptor *mask, const char *intrinsic,
+    Terminator &terminator) {
+  using Type = CppTypeFor<CAT, KIND>;
+  ACCUMULATOR accumulator{x};
+  if (dim == 0 || x.rank() == 1) {
+    // Total reduction
+
+    // Element size of the destination descriptor is the same
+    // as the element size of the source.
+    result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
+        CFI_attribute_allocatable);
+    if (int stat{result.Allocate()}) {
+      terminator.Crash(
+          "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
+    }
+    DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator);
+    accumulator.GetResult(result.OffsetElement<Type>());
+  } else {
+    // Partial reduction
+
+    // Element size of the destination descriptor is the same
+    // as the element size of the source.
+    PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim,
+        mask, terminator, intrinsic, accumulator);
+  }
+}
+
+// The data type used by Norm2Accumulator.
+template <int KIND>
+using Norm2AccumType =
+    CppTypeFor<TypeCategory::Real, std::clamp(KIND, 8, Norm2LargestLDKind)>;
+
+template <int KIND, typename ABS, typename SQRT> class Norm2Accumulator {
+public:
+  using Type = CppTypeFor<TypeCategory::Real, KIND>;
+  using AccumType = Norm2AccumType<KIND>;
+  explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
+      : array_{array} {}
+  RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; }
+  template <typename A>
+  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
+    // m * sqrt(1 + sum((others(:)/m)**2))
+    *p = static_cast<Type>(max_ * SQRT::compute(1 + sum_));
+  }
+  RT_API_ATTRS bool Accumulate(Type x) {
+    auto absX{ABS::compute(static_cast<AccumType>(x))};
+    if (!max_) {
+      max_ = absX;
+    } else if (absX > max_) {
+      auto t{max_ / absX}; // < 1.0
+      auto tsq{t * t};
+      sum_ *= tsq; // scale sum to reflect change to the max
+      sum_ += tsq; // include a term for the previous max
+      max_ = absX;
+    } else { // absX <= max_
+      auto t{absX / max_};
+      sum_ += t * t;
+    }
+    return true;
+  }
+  template <typename A>
+  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
+    return Accumulate(*array_.Element<A>(at));
+  }
+
+private:
+  const Descriptor &array_;
+  AccumType max_{0}; // value (m) with largest magnitude
+  AccumType sum_{0}; // sum((others(:)/m)**2)
+};
+
+// Helper class for creating Norm2Accumulator instance
+// based on the given KIND. This helper returns and instance
+// that uses std::abs and std::sqrt for the computations.
+template <int KIND> class Norm2AccumulatorGetter {
+  using AccumType = Norm2AccumType<KIND>;
+
+public:
+  struct ABSTy {
+    static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
+      return std::abs(std::forward<AccumType>(x));
+    }
+  };
+  struct SQRTTy {
+    static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) {
+      return std::sqrt(std::forward<AccumType>(x));
+    }
+  };
+
+  using Type = Norm2Accumulator<KIND, ABSTy, SQRTTy>;
+
+  static RT_API_ATTRS Type create(const Descriptor &x) { return Type(x); }
+};
+
 } // namespace Fortran::runtime
 #endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp
index 048399737c8501..d2495e3e956fe6 100644
--- a/flang/runtime/sum.cpp
+++ b/flang/runtime/sum.cpp
@@ -175,7 +175,8 @@ void RTDEF(CppSumComplex10)(CppTypeFor<TypeCategory::Complex, 10> &result,
   result = GetTotalReduction<TypeCategory::Complex, 10>(
       x, source, line, dim, mask, ComplexSumAccumulator<long double>{x}, "SUM");
 }
-#elif LDBL_MANT_DIG == 113
+#endif
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
 void RTDEF(CppSumComplex16)(CppTypeFor<TypeCategory::Complex, 16> &result,
     const Descriptor &x, const char *source, int line, int dim,
     const Descriptor *mask) {
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index 89e5069995748b..c1f89cadca06e7 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -266,7 +266,8 @@ inline RT_API_ATTRS RESULT ApplyIntegerKind(
   }
 }
 
-template <template <int KIND> class FUNC, typename RESULT, typename... A>
+template <template <int KIND> class FUNC, typename RESULT,
+    bool NEEDSMATH = false, typename... A>
 inline RT_API_ATTRS RESULT ApplyFloatingPointKind(
     int kind, Terminator &terminator, A &&...x) {
   switch (kind) {
@@ -287,7 +288,13 @@ inline RT_API_ATTRS RESULT ApplyFloatingPointKind(
     break;
   case 16:
     if constexpr (HasCppTypeFor<TypeCategory::Real, 16>) {
-      return FUNC<16>{}(std::forward<A>(x)...);
+      // If FUNC implemenation relies on FP math functions,
+      // then we should not be here. The compiler should have
+      // generated a call to an entry in FortranFloat128Math
+      // library.
+      if constexpr (!NEEDSMATH) {
+        return FUNC<16>{}(std::forward<A>(x)...);
+      }
     }
     break;
   }
diff --git a/flang/test/Integration/OpenMP/copyprivate.f90 b/flang/test/Integration/OpenMP/copyprivate.f90
new file mode 100644
index 00000000000000..d32319a18c28bb
--- /dev/null
+++ b/flang/test/Integration/OpenMP/copyprivate.f90
@@ -0,0 +1,97 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+! added to this directory and sub-directories.
+!===----------------------------------------------------------------------===!
+
+!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s
+
+!CHECK-DAG: define void @_copy_box_Uxi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_i64(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_Uxi64(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_f32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_2x3xf32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_z32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xz32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_l32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_5xl32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_c8x8(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_10xc8x8(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_c16x5(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_rec__QFtest_typesTdt(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_heap_Uxi32(ptr %{{.*}}, ptr %{{.*}})
+!CHECK-DAG: define void @_copy_box_ptr_Uxc8x9(ptr %{{.*}}, ptr %{{.*}})
+
+!CHECK-LABEL: define void @_copy_i32(
+!CHECK-SAME:                         ptr %[[DST:.*]], ptr %[[SRC:.*]]){{.*}} {
+!CHECK-NEXT:    %[[SRC_VAL:.*]] = load i32, ptr %[[SRC]]
+!CHECK-NEXT:    store i32 %[[SRC_VAL]], ptr %[[DST]]
+!CHECK-NEXT:    ret void
+!CHECK-NEXT:  }
+
+!CHECK-LABEL: define internal void @test_scalar_..omp_par({{.*}})
+!CHECK:         %[[I:.*]] = alloca i32, i64 1
+!CHECK:         %[[J:.*]] = alloca i32, i64 1
+!CHECK:         %[[DID_IT:.*]] = alloca i32
+!CHECK:         store i32 0, ptr %[[DID_IT]]
+!CHECK:         %[[THREAD_NUM1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]])
+!CHECK:         %[[RET:.*]] = call i32 @__kmpc_single({{.*}})
+!CHECK:         %[[NOT_ZERO:.*]] = icmp ne i32 %[[RET]], 0
+!CHECK:         br i1 %[[NOT_ZERO]], label %[[OMP_REGION_BODY:.*]], label %[[OMP_REGION_END:.*]]
+
+!CHECK:       [[OMP_REGION_END]]:
+!CHECK:         %[[THREAD_NUM2:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]])
+!CHECK:         %[[DID_IT_VAL:.*]] = load i32, ptr %[[DID_IT]]
+!CHECK:         call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM2]], i64 0, ptr %[[I]], ptr @_copy_i32, i32 %[[DID_IT_VAL]])
+!CHECK:         %[[THREAD_NUM3:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC]])
+!CHECK:         %[[DID_IT_VAL2:.*]] = load i32, ptr %[[DID_IT]]
+!CHECK:         call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM3]], i64 0, ptr %[[J]], ptr @_copy_i32, i32 %[[DID_IT_VAL2]])
+
+!CHECK:       [[OMP_REGION_BODY]]:
+!CHECK:         br label %[[OMP_SINGLE_REGION:.*]]
+!CHECK:       [[OMP_SINGLE_REGION]]:
+!CHECK:         store i32 11, ptr %[[I]]
+!CHECK:         store i32 22, ptr %[[J]]
+!CHECK:         br label %[[OMP_REGION_CONT3:.*]]
+!CHECK:       [[OMP_REGION_CONT3:.*]]:
+!CHECK:         store i32 1, ptr %[[DID_IT]]
+!CHECK:         call void @__kmpc_end_single(ptr @[[LOC]], i32 %[[THREAD_NUM1]])
+!CHECK:         br label %[[OMP_REGION_END]]
+subroutine test_scalar()
+  integer :: i, j
+
+  !$omp parallel private(i, j)
+  !$omp single
+  i = 11
+  j = 22
+  !$omp end single copyprivate(i, j)
+  !$omp end parallel
+end subroutine
+
+subroutine test_types(a, n)
+  integer :: a(:), n
+  integer(4) :: i4, i4a(10)
+  integer(8) :: i8, i8a(n)
+  real :: r, ra(2, 3)
+  complex :: z, za(10)
+  logical :: l, la(5)
+  character(kind=1, len=8) :: c1, c1a(10)
+  character(kind=2, len=5) :: c2
+
+  type dt
+    integer :: i
+    real :: r
+  end type
+  type(dt) :: t
+
+  integer, allocatable :: aloc(:)
+  character(kind=1, len=9), pointer :: ptr(:)
+
+  !$omp parallel private(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr)
+  !$omp single
+  !$omp end single copyprivate(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr)
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
index ad4b015ef9443f..05cae6e5ba6ccf 100644
--- a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
+++ b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90
@@ -224,7 +224,7 @@ subroutine internal()
     allocate(x)
   end subroutine
 end subroutine
-! CHECK-LABEL:   func.func @_QFno_dealloc_host_assocPinternal
+! CHECK-LABEL:   func.func private @_QFno_dealloc_host_assocPinternal
 ! CHECK-NOT: freemem
 ! CHECK-NOT: Deallocate
 ! CHECK: return
diff --git a/flang/test/Lower/HLFIR/bindc_internal_proc.f90 b/flang/test/Lower/HLFIR/bindc_internal_proc.f90
index 027c94f95a326e..00e24c7016f19f 100644
--- a/flang/test/Lower/HLFIR/bindc_internal_proc.f90
+++ b/flang/test/Lower/HLFIR/bindc_internal_proc.f90
@@ -3,7 +3,7 @@
 ! internal procedures.
 ! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
 
-!CHECK: func.func @_QFsub1Pfoo(%{{.*}}: i32
+!CHECK: func.func private @_QFsub1Pfoo(%{{.*}}: i32
 subroutine sub1()
   call foo(42)
 contains
@@ -13,7 +13,7 @@ subroutine foo(i) bind(c)
   end subroutine
 end subroutine
 
-!CHECK: func.func @_QFsub2Pfoo(%{{.*}}: i64
+!CHECK: func.func private @_QFsub2Pfoo(%{{.*}}: i64
 subroutine sub2()
   call foo(42_8)
 contains
diff --git a/flang/test/Lower/HLFIR/internal-procedures-2.f90 b/flang/test/Lower/HLFIR/internal-procedures-2.f90
index bb05545bef1aa2..f1c4780954b24d 100644
--- a/flang/test/Lower/HLFIR/internal-procedures-2.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures-2.f90
@@ -23,7 +23,7 @@ subroutine internal_procedure(i, mask)
   end forall
  end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFhost_procedurePinternal_procedure(
+! CHECK-LABEL: func.func private @_QFhost_procedurePinternal_procedure(
 ! CHECK:    fir.address_of(@_QMmodule_used_by_hostEindexed_by_var) : !fir.ref<!fir.array<2xi32>>
 ! CHECK:    fir.address_of(@_QMmodule_used_by_hostEref_in_forall) : !fir.ref<!fir.array<2xi32>>
 ! CHECK:    fir.address_of(@_QMmodule_used_by_hostEref_in_implied_do) : !fir.ref<i32>
diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
new file mode 100644
index 00000000000000..8645488290d715
--- /dev/null
+++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90
@@ -0,0 +1,81 @@
+! Test lowering of internal procedure capturing OPTIONAL polymorphic
+! objects.
+! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s
+
+
+module captured_optional_polymorphic
+  type sometype
+  end type
+contains
+subroutine test(x, y)
+  class(sometype), optional :: x
+  class(sometype), optional :: y(2:)
+  call internal()
+contains
+  subroutine internal()
+    if (present(x).and.present(y)) then
+      print *, same_type_as(x, y)
+    end if
+  end subroutine
+end
+end module
+
+! CHECK-LABEL:   func.func @_QMcaptured_optional_polymorphicPtest(
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare{{.*}}Ex
+! CHECK:           %[[VAL_3:.*]] = arith.constant 2 : i64
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index
+! CHECK:           %[[VAL_5:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1>
+! CHECK:           %[[VAL_6:.*]]:2 = hlfir.declare{{.*}}Ey
+! CHECK:           %[[VAL_7:.*]] = fir.alloca tuple<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>, !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_8]]
+! CHECK:           %[[VAL_10:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> i1
+! CHECK:           fir.if %[[VAL_10]] {
+! CHECK:             fir.store %[[VAL_2]]#1 to %[[VAL_9]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_11:.*]] = fir.zero_bits !fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK:             %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK:             fir.store %[[VAL_12]] to %[[VAL_9]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:           }
+! CHECK:           %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_13]]
+! CHECK:           %[[VAL_15:.*]] = fir.is_present %[[VAL_6]]#1 : (!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>) -> i1
+! CHECK:           fir.if %[[VAL_15]] {
+! CHECK:             %[[VAL_16:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1>
+! CHECK:             %[[VAL_17:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_16]]) : (!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>, !fir.shift<1>) -> !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:             fir.store %[[VAL_17]] to %[[VAL_14]] : !fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK:           } else {
+! CHECK:             %[[VAL_18:.*]] = fir.type_desc !fir.type<_QMcaptured_optional_polymorphicTsometype>
+! CHECK:             %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>) -> !fir.ref<!fir.box<none>>
+! CHECK:             %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.tdesc<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.ref<none>
+! CHECK:             %[[VAL_21:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 0 : i32
+! CHECK:             %[[VAL_23:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> none
+! CHECK:           }
+! CHECK:           fir.call @_QMcaptured_optional_polymorphicFtestPinternal(%[[VAL_7]])
+
+! CHECK-LABEL: func.func{{.*}} @_QMcaptured_optional_polymorphicFtestPinternal(
+! CHECK-SAME:      %[[VAL_0:.*]]: !fir.ref<tuple<{{.*}}>>
+! CHECK:           %[[VAL_1:.*]] = arith.constant 0 : i32
+! CHECK:           %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]]
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:           %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> !fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<!fir.type<_QMcaptured_optional_polymorphicTsometype>>) -> i64
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
+! CHECK:           %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64
+! CHECK:           %[[VAL_8:.*]] = fir.absent !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK:           %[[VAL_9:.*]] = arith.select %[[VAL_7]], %[[VAL_3]], %[[VAL_8]] : !fir.class<!fir.type<_QMcaptured_optional_polymorphicTsometype>>
+! CHECK:           %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs<optional, host_assoc>, {{.*}}Ex
+! CHECK:           %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]]
+! CHECK:           %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref<!fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant 0 : index
+! CHECK:           %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_14]]
+! CHECK:           %[[VAL_16:.*]] = fir.box_addr %[[VAL_13]]
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>) -> i64
+! CHECK:           %[[VAL_18:.*]] = arith.constant 0 : i64
+! CHECK:           %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_18]] : i64
+! CHECK:           %[[VAL_20:.*]] = fir.absent !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:           %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_13]], %[[VAL_20]] : !fir.class<!fir.array<?x!fir.type<_QMcaptured_optional_polymorphicTsometype>>>
+! CHECK:           %[[VAL_22:.*]] = fir.shift %[[VAL_15]]#0 : (index) -> !fir.shift<1>
+! CHECK:           %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%[[VAL_22]]) {fortran_attrs = #fir.var_attrs<optional, host_assoc>, {{.*}}Ey
diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90
index d517cb4345afa9..c898903b6fbe11 100644
--- a/flang/test/Lower/HLFIR/internal-procedures.f90
+++ b/flang/test/Lower/HLFIR/internal-procedures.f90
@@ -9,8 +9,8 @@ subroutine internal
   call takes_array(x)
 end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFtest_explicit_shape_arrayPinternal(
-! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_explicit_shape_arrayPinternal(
+! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -27,8 +27,8 @@ subroutine internal
   call takes_array(x)
 end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFtest_assumed_shapePinternal(
-! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_assumed_shapePinternal(
+! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -44,8 +44,8 @@ subroutine internal()
   call bar(c)
 end subroutine
 end subroutine
-! CHECK-LABEL:   func.func @_QFtest_scalar_charPinternal(
-! CHECK-SAME:                               %[[VAL_0:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL:   func.func private @_QFtest_scalar_charPinternal(
+! CHECK-SAME:                               %[[VAL_0:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref<!fir.boxchar<1>>
diff --git a/flang/test/Lower/Intrinsics/norm2.f90 b/flang/test/Lower/Intrinsics/norm2.f90
index f14cad59d5bd3b..0d125e36f6650a 100644
--- a/flang/test/Lower/Intrinsics/norm2.f90
+++ b/flang/test/Lower/Intrinsics/norm2.f90
@@ -76,3 +76,19 @@ subroutine norm2_test_dim_3(a,r)
   ! CHECK-DAG:  %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf32>>>) -> !fir.heap<!fir.array<?x?xf32>>
   ! CHECK-DAG:  fir.freemem %[[addr]]
 end subroutine norm2_test_dim_3
+
+! CHECK-LABEL: func @_QPnorm2_test_real16(
+! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?x?xf128>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?x?xf128>>{{.*}})
+subroutine norm2_test_real16(a,r)
+  real(16) :: a(:,:,:)
+  real(16) :: r(:,:)
+  ! CHECK-DAG:  %[[dim:.*]] = arith.constant 3 : i32
+  ! CHECK-DAG:  %[[r:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xf128>>>
+  ! CHECK-DAG:  %[[res:.*]] = fir.convert %[[r]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>>) -> !fir.ref<!fir.box<none>>
+  ! CHECK:  %[[arr:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?x?xf128>>) -> !fir.box<none>
+  r = norm2(a,dim=3)
+  ! CHECK:  %{{.*}} = fir.call @_FortranANorm2DimReal16(%[[res]], %[[arr]], %[[dim]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> none
+  ! CHECK:  %[[box:.*]] = fir.load %[[r]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xf128>>>>
+  ! CHECK-DAG:  %[[addr:.*]] = fir.box_addr %[[box]] : (!fir.box<!fir.heap<!fir.array<?x?xf128>>>) -> !fir.heap<!fir.array<?x?xf128>>
+  ! CHECK-DAG:  fir.freemem %[[addr]]
+end subroutine norm2_test_real16
diff --git a/flang/test/Lower/Intrinsics/random.f90 b/flang/test/Lower/Intrinsics/random.f90
index ca194befd02741..4fb1a9a5da27a7 100644
--- a/flang/test/Lower/Intrinsics/random.f90
+++ b/flang/test/Lower/Intrinsics/random.f90
@@ -45,7 +45,7 @@ subroutine random_test_2
   call foo(size)
   call bar(size, get)
 contains
-  ! CHECK-LABEL: func @_QFrandom_test_2Pfoo
+  ! CHECK-LABEL: func private @_QFrandom_test_2Pfoo
   subroutine foo(size, put, get)
     ! CHECK: [[s1:%[0-9]+]] = fir.is_present %arg0
     ! CHECK: [[s2:%[0-9]+]] = fir.embox %arg0
@@ -70,7 +70,7 @@ subroutine foo(size, put, get)
     print*, size
   end subroutine
 
-  ! CHECK-LABEL: func @_QFrandom_test_2Pbar
+  ! CHECK-LABEL: func private @_QFrandom_test_2Pbar
   subroutine bar(size, get, put)
     integer, optional :: size
     ! CHECK: [[p1:%[0-9]+]] = fir.is_present %arg2
diff --git a/flang/test/Lower/Intrinsics/ubound01.f90 b/flang/test/Lower/Intrinsics/ubound01.f90
index 4ebe3870cf0b36..df51d79eb6afe0 100644
--- a/flang/test/Lower/Intrinsics/ubound01.f90
+++ b/flang/test/Lower/Intrinsics/ubound01.f90
@@ -16,7 +16,7 @@ subroutine s2(a,n,n2)
   End Subroutine
 end
 
-! CHECK-LABEL: func.func @_QFPs2
+! CHECK-LABEL: func.func private @_QFPs2
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xf32>>
 ! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]](%{{.*}}) : (!fir.box<!fir.array<?x?xf32>>, !fir.shift<2>) -> !fir.box<!fir.array<?x?xf32>>
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]] : (!fir.box<!fir.array<?x?xf32>>) -> !fir.box<none>
diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90
index b5f5aa2ca4882e..2339c23eaaf857 100644
--- a/flang/test/Lower/OpenACC/acc-routine04.f90
+++ b/flang/test/Lower/OpenACC/acc-routine04.f90
@@ -31,4 +31,4 @@ subroutine sub2()
 ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq
 ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref<i32> {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>}
 ! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"}
-! CHECK: func.func @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>}
+! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, llvm.linkage = #llvm.linkage<internal>}
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
new file mode 100644
index 00000000000000..122542345f104b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+  implicit none
+  integer :: var1
+
+!$OMP PARALLEL FIRSTPRIVATE(var1)
+  var1 = 10
+!$OMP END PARALLEL
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK:    %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK:    fir.store %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<i32>
+! CHECK:    omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
+
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
new file mode 100644
index 00000000000000..2e9995ea1fd4c4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
@@ -0,0 +1,38 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+  implicit none
+  integer :: var1
+
+!$OMP PARALLEL PRIVATE(var1)
+  var1 = 10
+!$OMP END PARALLEL
+
+!$OMP PARALLEL PRIVATE(var1)
+  var1 = 20
+!$OMP END PARALLEL
+
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT:   omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK-NOT: } copy {
+
+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+! CHECK: fir.store %[[C10]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
+
+! Test that the same privatizer is used if the a variable with the same type and
+! name was previously privatized.
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C20:.*]] = arith.constant 20 : i32
+! CHECK: fir.store %[[C20]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
index 14c0dff8da4bb1..6db5735c21f1e1 100644
--- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
+++ b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90
@@ -17,7 +17,7 @@ module m
 ! CHECK:         return
 ! CHECK:       }
 !
-! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() {
+! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
 ! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
 ! CHECK:         %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref<i32> -> !fir.ref<i32>
 ! CHECK:         fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref<i32>) -> ()
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
new file mode 100644
index 00000000000000..e3d2a5a8af2608
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `firstprivate` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+  implicit none
+  integer :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_firstprivateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK:    %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK:    hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] temporary_lhs : i32, !fir.ref<i32>
+! CHECK:    omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
new file mode 100644
index 00000000000000..46eef6eb3bcf6a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
@@ -0,0 +1,34 @@
+! Test delayed privatization for both `private` and `firstprivate` clauses.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private_firstprivate
+  implicit none
+  integer :: var1
+  integer :: var2
+
+!$omp parallel private(var1) firstprivate(var2)
+  var1 = 10
+  var2 = var1 + var2
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: } copy {
+! CHECK: }
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPdelayed_privatization_private_firstprivate() {
+! CHECK:  %[[VAR1_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1"
+! CHECK:  %[[VAR1_DECL:.*]]:2 = hlfir.declare %[[VAR1_ALLOC]]
+
+! CHECK:  %[[VAR2_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var2"
+! CHECK:  %[[VAR2_DECL:.*]]:2 = hlfir.declare %[[VAR2_ALLOC]]
+
+! CHECK:  omp.parallel private(
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM]] %[[VAR1_DECL]]#0 -> %{{.*}} : !fir.ref<i32>, 
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM]] %[[VAR2_DECL]]#0 -> %{{.*}} : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
new file mode 100644
index 00000000000000..240e0e71bfcd16
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
@@ -0,0 +1,28 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+  implicit none
+  integer :: var1
+
+!$omp parallel private(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT:   %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT:   omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK-NOT: } copy {
+
+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: %[[ORIG_DECL:.*]]:2 = hlfir.declare %[[ORIG_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_DECL]]#0 -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[PAR_ARG_DECL:.*]]:2 = hlfir.declare %[[PAR_ARG]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: hlfir.assign %{{.*}} to %[[PAR_ARG_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
new file mode 100644
index 00000000000000..c61f352b9b055a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
@@ -0,0 +1,30 @@
+! Test that reductions and delayed privatization work properly togehter. Since
+! both types of clauses add block arguments to the OpenMP region, we make sure
+! that the block arguments are added in the proper order (reductions first and
+! then delayed privatization.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine red_and_delayed_private
+    integer :: red
+    integer :: prv
+
+    red = 0
+    prv = 10
+
+    !$omp parallel reduction(+:red) private(prv)
+    red = red + 1
+    prv = 20
+    !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+
+! CHECK-LABEL: omp.reduction.declare
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : i32 init
+
+! CHECK-LABEL: _QPred_and_delayed_private
+! CHECK: omp.parallel
+! CHECK-SAME: reduction(@[[REDUCTION_SYM]] %{{.*}} -> %arg0 : !fir.ref<i32>)
+! CHECK-SAME: private(@[[PRIVATIZER_SYM]] %{{.*}} -> %arg1 : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
index 28616f7595a0d6..71f1c7608a2c31 100644
--- a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90
@@ -15,7 +15,7 @@ module  m1
   subroutine ss1
     use m0
   contains
-!CHECK-LABEL: func @_QMm1Fss1Pss2
+!CHECK-LABEL: func private @_QMm1Fss1Pss2
 !CHECK: %[[CMN:.*]] = fir.address_of(@cmn_) : !fir.ref<!fir.array<4xi8>>
 !CHECK: omp.parallel
 !CHECK: %{{.*}} = omp.threadprivate %[[CMN]] : !fir.ref<!fir.array<4xi8>> -> !fir.ref<!fir.array<4xi8>>
diff --git a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
index 722f023fbefc96..79a1ce9897f256 100644
--- a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
+++ b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90
@@ -19,7 +19,7 @@ module m
 ! CHECK:         return
 ! CHECK:       }
 
-! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() {
+! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} {
 ! CHECK:         %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref<i32>
 ! CHECK:         %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMmEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK:         %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref<i32> -> !fir.ref<i32>
diff --git a/flang/test/Lower/PowerPC/ppc-vector-types.f90 b/flang/test/Lower/PowerPC/ppc-vector-types.f90
index be293f873ecb42..4745e4567b2d12 100644
--- a/flang/test/Lower/PowerPC/ppc-vector-types.f90
+++ b/flang/test/Lower/PowerPC/ppc-vector-types.f90
@@ -44,7 +44,7 @@ program ppc_vec_unit
       ! CHECK-LLVM-NEXT: store <512 x i1> %[[RESQ]], ptr @_QFEvq2, align 64
 
       contains
-      ! CHECK-LLVM-LABEL: define <4 x i32> @_QFPtest_vec_integer_assign
+      ! CHECK-LLVM-LABEL: define internal <4 x i32> @_QFPtest_vec_integer_assign
       function test_vec_integer_assign(arg1)
         ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <4 x i32>, i64 1, align 16
         vector(integer(4)) :: arg1, test_vec_integer_assign
@@ -58,7 +58,7 @@ function test_vec_integer_assign(arg1)
         ! CHECK-LLVM-NEXT: ret <4 x i32> %[[RET]]
       end function test_vec_integer_assign
 
-      ! CHECK-LLVM-LABEL: define <2 x double> @_QFPtest_vec_real_assign
+      ! CHECK-LLVM-LABEL: define internal <2 x double> @_QFPtest_vec_real_assign
       function test_vec_real_assign(arg1)
         ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <2 x double>, i64 1, align 16
         vector(real(8)) :: arg1, test_vec_real_assign
@@ -72,7 +72,7 @@ function test_vec_real_assign(arg1)
         ! CHECK-LLVM-NEXT: ret <2 x double> %[[RET]]
       end function test_vec_real_assign
 
-      ! CHECK-LLVM-LABEL: define <8 x i16> @_QFPtest_vec_unsigned_assign
+      ! CHECK-LLVM-LABEL: define internal <8 x i16> @_QFPtest_vec_unsigned_assign
       function test_vec_unsigned_assign(arg1)
         ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <8 x i16>, i64 1, align 16
         vector(unsigned(2)) :: arg1, test_vec_unsigned_assign
@@ -86,7 +86,7 @@ function test_vec_unsigned_assign(arg1)
         ! CHECK-LLVM-NEXT: ret <8 x i16> %[[RET]]
       end function test_vec_unsigned_assign
 
-      ! CHECK-LLVM-LABEL: define <256 x i1> @_QFPtest_vec_pair_assign
+      ! CHECK-LLVM-LABEL: define internal <256 x i1> @_QFPtest_vec_pair_assign
       function test_vec_pair_assign(arg1)
         ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <256 x i1>, i64 1, align 32
         __vector_pair :: arg1, test_vec_pair_assign
@@ -100,7 +100,7 @@ function test_vec_pair_assign(arg1)
         ! CHECK-LLVM-NEXT: ret <256 x i1> %[[RET]]
       end function test_vec_pair_assign
 
-      ! CHECK-LLVM-LABEL: define <512 x i1> @_QFPtest_vec_quad_assign
+      ! CHECK-LLVM-LABEL: define internal <512 x i1> @_QFPtest_vec_quad_assign
       function test_vec_quad_assign(arg1)
         ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <512 x i1>, i64 1, align 64
         __vector_quad :: arg1, test_vec_quad_assign
diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90
index 347d4cef78bcd1..10c5ee91d44bda 100644
--- a/flang/test/Lower/array-temp.f90
+++ b/flang/test/Lower/array-temp.f90
@@ -404,7 +404,7 @@ subroutine tt1
   ! CHECK-NEXT: fir.call @_FortranAioEndIoStatement
   print*, [(r([7.0]),i=1,3)]
 contains
-  ! CHECK-LABEL: func @_QFtt1Pr
+  ! CHECK-LABEL: func private @_QFtt1Pr
   function r(x)
     real x(:)
     r = x(1)
diff --git a/flang/test/Lower/dummy-arguments.f90 b/flang/test/Lower/dummy-arguments.f90
index 43d8e3c1e5d448..331e089a60fa02 100644
--- a/flang/test/Lower/dummy-arguments.f90
+++ b/flang/test/Lower/dummy-arguments.f90
@@ -9,7 +9,7 @@ program test1
   call foo(10)
 contains
 
-! CHECK-LABEL: func @_QFPfoo
+! CHECK-LABEL: func private @_QFPfoo
 subroutine foo(avar1)
   integer :: avar1
 !  integer :: my_data, my_data2
diff --git a/flang/test/Lower/dummy-procedure-character.f90 b/flang/test/Lower/dummy-procedure-character.f90
index cecd839287ed13..72d548513fb270 100644
--- a/flang/test/Lower/dummy-procedure-character.f90
+++ b/flang/test/Lower/dummy-procedure-character.f90
@@ -213,7 +213,7 @@ subroutine host(f)
   ! CHECK: fir.call @_QFhostPintern(%[[VAL_1]])
   call intern()
 contains
-! CHECK-LABEL: func @_QFhostPintern(
+! CHECK-LABEL: func private @_QFhostPintern(
 ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc})
   subroutine intern()
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
@@ -242,7 +242,7 @@ subroutine host2(f)
   ! CHECK: fir.call @_QFhost2Pintern(%[[VAL_1]])
   call intern()
 contains
-! CHECK-LABEL: func @_QFhost2Pintern(
+! CHECK-LABEL: func private @_QFhost2Pintern(
 ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc})
   subroutine intern()
     ! CHECK:  %[[VAL_1:.*]] = fir.alloca !fir.char<1,42> {bindc_name = ".result"}
diff --git a/flang/test/Lower/equivalence-with-host-assoc.f90 b/flang/test/Lower/equivalence-with-host-assoc.f90
index ec84fb506314ba..0ffb1bc5bf9ee1 100644
--- a/flang/test/Lower/equivalence-with-host-assoc.f90
+++ b/flang/test/Lower/equivalence-with-host-assoc.f90
@@ -10,7 +10,7 @@ subroutine inner
     i1 = j1
   end subroutine inner
 end subroutine test1
-! FIR-LABEL:   func.func @_QFtest1Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL:   func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! FIR:           %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref<!fir.array<1xi32>>
 ! FIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
 ! FIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -24,7 +24,7 @@ end subroutine test1
 ! FIR:           return
 ! FIR:         }
 
-! HLFIR-LABEL:   func.func @_QFtest1Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL:   func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! HLFIR:           %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref<!fir.array<1xi32>>
 ! HLFIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
 ! HLFIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -54,7 +54,7 @@ subroutine inner
     end subroutine inner
   end subroutine host
 end module test2
-! FIR-LABEL:   func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} {
+! FIR-LABEL:   func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! FIR:           %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref<!fir.array<1xi32>>
 ! FIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
 ! FIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -68,7 +68,7 @@ end module test2
 ! FIR:           return
 ! FIR:         }
 
-! HLFIR-LABEL:   func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL:   func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! HLFIR:           %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref<!fir.array<1xi32>>
 ! HLFIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<1xi32>>) -> !fir.ref<!fir.array<4xi8>>
 ! HLFIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -94,7 +94,7 @@ subroutine inner
     i1 = j1 + k1
   end subroutine inner
 end subroutine test3
-! FIR-LABEL:   func.func @_QFtest3Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL:   func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! FIR:           %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
 ! FIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
 ! FIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -115,7 +115,7 @@ end subroutine test3
 ! FIR:           return
 ! FIR:         }
 
-! HLFIR-LABEL:   func.func @_QFtest3Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL:   func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! HLFIR:           %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
 ! HLFIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
 ! HLFIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -149,7 +149,7 @@ subroutine inner
     i1 = j1 + k1
   end subroutine inner
 end subroutine test4
-! FIR-LABEL:   func.func @_QFtest4Pinner() attributes {fir.internal_proc} {
+! FIR-LABEL:   func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! FIR:           %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
 ! FIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
 ! FIR:           %[[VAL_2:.*]] = arith.constant 0 : index
@@ -170,7 +170,7 @@ end subroutine test4
 ! FIR:           return
 ! FIR:         }
 
-! HLFIR-LABEL:   func.func @_QFtest4Pinner() attributes {fir.internal_proc} {
+! HLFIR-LABEL:   func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! HLFIR:           %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref<tuple<i32>>
 ! HLFIR:           %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<tuple<i32>>) -> !fir.ref<!fir.array<?xi8>>
 ! HLFIR:           %[[VAL_2:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90
index 4e4b035bae7e1e..86aae720e7fcf9 100644
--- a/flang/test/Lower/explicit-interface-results-2.f90
+++ b/flang/test/Lower/explicit-interface-results-2.f90
@@ -69,8 +69,8 @@ subroutine host4()
   integer :: n
   call internal_proc_a()
 contains
-! CHECK-LABEL: func @_QFhost4Pinternal_proc_a
-! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFhost4Pinternal_proc_a
+! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine internal_proc_a()
     call takes_array(return_array())
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
@@ -94,7 +94,7 @@ subroutine host5()
   implicit none
   call internal_proc_a()
 contains
-! CHECK-LABEL: func @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine internal_proc_a()
     call takes_array(return_array())
 ! CHECK:  %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref<i32>
@@ -115,7 +115,7 @@ subroutine host6()
   implicit none
   call internal_proc_a()
 contains
-! CHECK-LABEL: func @_QFhost6Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost6Pinternal_proc_a
   subroutine internal_proc_a()
     call takes_array(return_array())
 ! CHECK:  %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref<i32>
@@ -187,7 +187,7 @@ subroutine host9()
   common /mycom/ n_common
   call internal_proc_a()
 contains
-! CHECK-LABEL: func @_QFhost9Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost9Pinternal_proc_a
   subroutine internal_proc_a()
 ! CHECK:  %[[VAL_0:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_1:.*]] = fir.address_of(@mycom_) : !fir.ref<!fir.array<4xi8>>
@@ -213,7 +213,7 @@ subroutine host10()
   implicit none
   call internal_proc_a()
 contains
-! CHECK-LABEL: func @_QFhost10Pinternal_proc_a
+! CHECK-LABEL: func private @_QFhost10Pinternal_proc_a
   subroutine internal_proc_a()
     call takes_array(return_array())
 ! CHECK:  %[[VAL_0:.*]] = arith.constant 0 : index
diff --git a/flang/test/Lower/forall/array-constructor.f90 b/flang/test/Lower/forall/array-constructor.f90
index 083a71ba479aa2..ad21ed33fba2d4 100644
--- a/flang/test/Lower/forall/array-constructor.f90
+++ b/flang/test/Lower/forall/array-constructor.f90
@@ -114,8 +114,8 @@ end subroutine ac1
 ! CHECK:         return
 ! CHECK:       }
 
-! CHECK-LABEL: func @_QFac1Pfunc(
-! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> i32 {
+! CHECK-LABEL: func private @_QFac1Pfunc(
+! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> i32 {{.*}} {
 ! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "func", uniq_name = "_QFac1FfuncEfunc"}
 ! CHECK:         %[[VAL_2:.*]] = arith.constant 1 : i64
 ! CHECK:         %[[VAL_3:.*]] = arith.constant 1 : i64
@@ -259,8 +259,8 @@ end subroutine ac2
 ! CHECK:         return
 ! CHECK:       }
 
-! CHECK-LABEL: func @_QFac2Pfunc(
-! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> !fir.array<3xi32> {
+! CHECK-LABEL: func private @_QFac2Pfunc(
+! CHECK-SAME:                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) -> !fir.array<3xi32> {{.*}} {
 ! CHECK:         %[[VAL_1:.*]] = arith.constant 3 : index
 ! CHECK:         %[[VAL_2:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "func", uniq_name = "_QFac2FfuncEfunc"}
 ! CHECK:         %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1>
diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90
index e5c40a16420a62..e97c3f36b0b10f 100644
--- a/flang/test/Lower/forall/character-1.f90
+++ b/flang/test/Lower/forall/character-1.f90
@@ -17,7 +17,7 @@ subroutine sub(x)
   end subroutine sub
 end program test
 
-! CHECK-LABEL: define void @_QFPsub(
+! CHECK-LABEL: define internal void @_QFPsub(
 ! CHECK-SAME:    ptr %[[arg:.*]])
 ! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1
 ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]]
diff --git a/flang/test/Lower/global-initialization.f90 b/flang/test/Lower/global-initialization.f90
index dd60a6fd8b9f86..ff208ecc3c8978 100644
--- a/flang/test/Lower/global-initialization.f90
+++ b/flang/test/Lower/global-initialization.f90
@@ -4,16 +4,19 @@ program bar
 ! CHECK: fir.address_of(@[[name1:.*]]my_data)
   integer, save :: my_data = 1
   print *, my_data
+  call foo()
+  call foo2()
+  call foo3()
 contains
 
-! CHECK-LABEL: func @_QFPfoo
+! CHECK-LABEL: func private @_QFPfoo
 subroutine foo()
 ! CHECK: fir.address_of(@[[name2:.*foo.*my_data]])
   integer, save :: my_data = 2
   print *, my_data + 1
 end subroutine
 
-! CHECK-LABEL: func @_QFPfoo2
+! CHECK-LABEL: func private @_QFPfoo2
 subroutine foo2()
 ! CHECK: fir.address_of(@[[name3:.*foo2.*my_data]])
   integer, save :: my_data
@@ -21,7 +24,7 @@ subroutine foo2()
   print *, my_data
 end subroutine
 
-! CHECK-LABEL: func @_QFPfoo3
+! CHECK-LABEL: func private @_QFPfoo3
 subroutine foo3()
 ! CHECK-DAG: fir.address_of(@[[name4:.*foo3.*idata]]){{.*}}fir.array<5xi32>
 ! CHECK-DAG: fir.address_of(@[[name5:.*foo3.*rdata]]){{.*}}fir.array<3xf16>
diff --git a/flang/test/Lower/host-associated-functions.f90 b/flang/test/Lower/host-associated-functions.f90
index 77a51490950fa1..78d081748c2f42 100644
--- a/flang/test/Lower/host-associated-functions.f90
+++ b/flang/test/Lower/host-associated-functions.f90
@@ -19,8 +19,8 @@ subroutine capture_char_func_dummy(char_func_dummy, n)
   ! CHECK:  fir.call @_QFcapture_char_func_dummyPinternal(%[[VAL_2]]) {{.*}}: (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>>) -> ()
   call internal()
 contains
-  ! CHECK-LABEL: func @_QFcapture_char_func_dummyPinternal(
-  ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFcapture_char_func_dummyPinternal(
+  ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine internal()
   ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
   ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>, !fir.ref<i32>>>, i32) -> !fir.ref<tuple<!fir.boxproc<() -> ()>, i64>>
@@ -55,8 +55,8 @@ subroutine capture_char_func_assumed_dummy(char_func_dummy)
 ! CHECK:  fir.call @_QFcapture_char_func_assumed_dummyPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>>) -> ()
   call internal()
 contains
-! CHECK-LABEL: func @_QFcapture_char_func_assumed_dummyPinternal(
-! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFcapture_char_func_assumed_dummyPinternal(
+! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine internal()
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<tuple<!fir.boxproc<() -> ()>, i64>>>, i32) -> !fir.ref<tuple<!fir.boxproc<() -> ()>, i64>>
@@ -84,7 +84,7 @@ subroutine capture_char_func(n)
 ! CHECK:  fir.call @_QFcapture_char_funcPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref<tuple<!fir.ref<i32>>>) -> ()
   call internal()
 contains
-! CHECK-LABEL: func @_QFcapture_char_funcPinternal(
+! CHECK-LABEL: func private @_QFcapture_char_funcPinternal(
 ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc})
   subroutine internal()
    print *, char_func()
@@ -109,8 +109,8 @@ function array_func()
   call internal()
 contains
   subroutine internal()
-! CHECK-LABEL: func @_QFcapture_array_funcPinternal(
-! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFcapture_array_funcPinternal(
+! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : i32
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
 ! CHECK:  %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -146,7 +146,7 @@ subroutine use_module()
 ! CHECK:  fir.call @_QFuse_modulePinternal() {{.*}}: () -> ()
   call internal()
   contains
-! CHECK-LABEL: func @_QFuse_modulePinternal() {
+! CHECK-LABEL: func private @_QFuse_modulePinternal() {{.*}} {
   subroutine internal()
     print *, return_char(42)
   end subroutine
diff --git a/flang/test/Lower/host-associated-globals.f90 b/flang/test/Lower/host-associated-globals.f90
index bb22a32775427d..fe612e777aeaad 100644
--- a/flang/test/Lower/host-associated-globals.f90
+++ b/flang/test/Lower/host-associated-globals.f90
@@ -18,7 +18,7 @@ subroutine bar()
     print *, j_in_equiv, not_in_equiv
  end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFmodule_varPbar()
+! CHECK-LABEL: func.func private @_QFmodule_varPbar()
 ! CHECK:  %[[VAL_0:.*]] = fir.address_of(@_QMtest_mod_used_in_hostEi) : !fir.ref<!fir.array<4xi8>>
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<!fir.array<4xi8>>, index) -> !fir.ref<i8>
@@ -37,7 +37,7 @@ subroutine bar()
     print *, j_in_equiv, not_in_equiv
  end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFtest_commonPbar() attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_0:.*]] = fir.address_of(@x_) : !fir.ref<!fir.array<12xi8>>
 ! CHECK:  %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.array<12xi8>>) -> !fir.ref<!fir.array<?xi8>>
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 4 : index
@@ -59,7 +59,7 @@ subroutine bar()
     print *, j_in_equiv, not_in_equiv
  end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFsaved_equivPbar() attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_0:.*]] = fir.address_of(@_QFsaved_equivEi) : !fir.ref<!fir.array<8xi8>>
 ! CHECK:  %[[VAL_1:.*]] = arith.constant 4 : index
 ! CHECK:  %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref<!fir.array<8xi8>>, index) -> !fir.ref<i8>
@@ -79,8 +79,8 @@ subroutine bar()
     call test(saved_j, j)
  end subroutine
 end subroutine
-! CHECK-LABEL: func.func @_QFmixed_capturePbar(
-! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QFmixed_capturePbar(
+! CHECK-SAME:    %[[VAL_0:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:  %[[VAL_1:.*]] = fir.address_of(@_QFmixed_captureEsaved_i) : !fir.ref<!fir.array<4xi8>>
 ! CHECK:  %[[VAL_2:.*]] = arith.constant 0 : index
 ! CHECK:  %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref<!fir.array<4xi8>>, index) -> !fir.ref<i8>
diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90
index 25e637805e872b..f88903c8af80f8 100644
--- a/flang/test/Lower/host-associated.f90
+++ b/flang/test/Lower/host-associated.f90
@@ -19,8 +19,8 @@ subroutine test1
   call test1_internal
   print *, i
 contains
-  ! CHECK-LABEL: func @_QFtest1Ptest1_internal(
-  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest1Ptest1_internal(
+  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   ! CHECK: %[[iaddr:.*]] = fir.coordinate_of %[[arg]], %c0
   ! CHECK: %[[i:.*]] = fir.load %[[iaddr]] : !fir.llvm_ptr<!fir.ref<i32>>
   ! CHECK: %[[val:.*]] = fir.call @_QPifoo() {{.*}}: () -> i32
@@ -46,8 +46,8 @@ subroutine test2
   call test2_internal
   print *, a, b
 contains
-  ! CHECK-LABEL: func @_QFtest2Ptest2_internal(
-  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest2Ptest2_internal(
+  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test2_internal
     ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0
     ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr<!fir.ref<f32>>
@@ -61,8 +61,8 @@ subroutine test2_internal
     call test2_inner
   end subroutine test2_internal
 
-  ! CHECK-LABEL: func @_QFtest2Ptest2_inner(
-  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest2Ptest2_inner(
+  ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref<tuple<!fir.ref<f32>, !fir.ref<f32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test2_inner
     ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0
     ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr<!fir.ref<f32>>
@@ -95,8 +95,8 @@ subroutine test6(c)
   print *, c
 
 contains
-  ! CHECK-LABEL: func @_QFtest6Ptest6_inner(
-  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest6Ptest6_inner(
+  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test6_inner
     ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxchar<1>>>, i32) -> !fir.ref<!fir.boxchar<1>>
     ! CHECK: %[[load:.*]] = fir.load %[[coor]] : !fir.ref<!fir.boxchar<1>>
@@ -137,8 +137,8 @@ subroutine test3(p,q,i)
   end if
   
 contains
-  ! CHECK-LABEL: func @_QFtest3Ptest3_inner(
-  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest3Ptest3_inner(
+  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test3_inner
     ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<?xf32>>>
     ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref<!fir.box<!fir.array<?xf32>>>
@@ -184,8 +184,8 @@ subroutine test3a(p)
   end if
   
 contains
-  ! CHECK: func @_QFtest3aPtest3a_inner(
-  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK: func private @_QFtest3aPtest3a_inner(
+  ! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test3a_inner
     ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.box<!fir.array<10xf32>>, !fir.box<!fir.array<10xf32>>>>, i32) -> !fir.ref<!fir.box<!fir.array<10xf32>>>
     ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref<!fir.box<!fir.array<10xf32>>>
@@ -228,8 +228,8 @@ subroutine test4
   end if
   
 contains
-  ! CHECK-LABEL: func @_QFtest4Ptest4_inner(
-  ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest4Ptest4_inner(
+  ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test4_inner
     ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<f32>>>>
     ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<f32>>>>
@@ -270,8 +270,8 @@ subroutine test5
   end if
   
 contains
-  ! CHECK-LABEL: func @_QFtest5Ptest5_inner(
-  ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+  ! CHECK-LABEL: func private @_QFtest5Ptest5_inner(
+  ! CHECK-SAME:%[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
   subroutine test5_inner
     ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>>
     ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>>
@@ -308,8 +308,8 @@ subroutine test7(j, k)
   k = test7_inner(k)
 contains
 
-! CHECK-LABEL: func @_QFtest7Ptest7_inner(
-! CHECK-SAME: %[[i:.*]]: !fir.ref<i32>{{.*}}, %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest7Ptest7_inner(
+! CHECK-SAME: %[[i:.*]]: !fir.ref<i32>{{.*}}, %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 elemental integer function test7_inner(i)
   implicit none
   integer, intent(in) :: i
@@ -329,8 +329,8 @@ subroutine issue990()
   integer :: captured
   call bar()
 contains
-! CHECK-LABEL: func @_QFissue990Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFissue990Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 subroutine bar()
   integer :: stmt_func, i
   stmt_func(i) = i + captured
@@ -351,8 +351,8 @@ subroutine issue990b()
   captured_stmt_func(i) = i + captured
   call bar()
 contains
-! CHECK-LABEL: func @_QFissue990bPbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFissue990bPbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 subroutine bar()
   ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
   ! CHECK: %[[addr:.*]] = fir.load %[[tupAddr]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -372,8 +372,8 @@ real function dummy_proc(x)
  end interface
  call bar()
 contains
-! CHECK-LABEL: func @_QFtest8Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest8Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 subroutine bar()
   ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxproc<() -> ()>>>, i32) -> !fir.ref<!fir.boxproc<() -> ()>>
   ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref<!fir.boxproc<() -> ()>>
@@ -392,8 +392,8 @@ subroutine dummy_proc()
  end interface
  call bar()
 contains
-! CHECK-LABEL: func @_QFtest9Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest9Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.boxproc<() -> ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 subroutine bar()
   ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.boxproc<() -> ()>>>, i32) -> !fir.ref<!fir.boxproc<() -> ()>>
   ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref<!fir.boxproc<() -> ()>>
@@ -415,8 +415,8 @@ subroutine test10(i)
  ! CHECK: fir.call @_QFtest10Pbar(%[[tup]]) {{.*}}: (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>>) -> ()
  call bar()
 contains
-! CHECK-LABEL: func @_QFtest10Pbar(
-! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func private @_QFtest10Pbar(
+! CHECK-SAME: %[[tup:.*]]: !fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 subroutine bar()
   ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref<tuple<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>>, i32) -> !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>
   ! CHECK: fir.load %[[tupAddr]] : !fir.llvm_ptr<!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>>
@@ -433,9 +433,9 @@ subroutine bar()
 ! CHECK:         %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]], %[[VAL_5]] : ((!fir.ref<i32>, !fir.ref<tuple<!fir.ref<i32>>>) -> (), !fir.ref<tuple<!fir.ref<i32>>>) -> !fir.boxproc<() -> ()>
 ! CHECK:         fir.call @_QPtest_proc_dummy_other(%[[VAL_8]]) {{.*}}: (!fir.boxproc<() -> ()>) -> ()
 
-! CHECK-LABEL: func @_QFtest_proc_dummyPtest_proc_dummy_a(
+! CHECK-LABEL: func private @_QFtest_proc_dummyPtest_proc_dummy_a(
 ! CHECK-SAME:          %[[VAL_0:.*]]: !fir.ref<i32> {fir.bindc_name = "j"},
-! CHECK-SAME:          %[[VAL_1:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-SAME:          %[[VAL_1:.*]]: !fir.ref<tuple<!fir.ref<i32>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK:         %[[VAL_2:.*]] = arith.constant 0 : i32
 ! CHECK:         %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref<tuple<!fir.ref<i32>>>, i32) -> !fir.llvm_ptr<!fir.ref<i32>>
 ! CHECK:         %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.llvm_ptr<!fir.ref<i32>>
@@ -525,10 +525,10 @@ end subroutine test_proc_dummy_other
 ! CHECK:         return
 ! CHECK:       }
 
-! CHECK-LABEL: func @_QFtest_proc_dummy_charPgen_message(
+! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message(
 ! CHECK-SAME:                                            %[[VAL_0:.*]]: !fir.ref<!fir.char<1,10>>,
 ! CHECK-SAME:                                            %[[VAL_1:.*]]: index,
-! CHECK-SAME:                                            %[[VAL_2:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc} {
+! CHECK-SAME:                                            %[[VAL_2:.*]]: !fir.ref<tuple<!fir.boxchar<1>>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK-DAG:         %[[VAL_3:.*]] = arith.constant 0 : i32
 ! CHECK-DAG:         %[[VAL_4:.*]] = arith.constant 10 : index
 ! CHECK-DAG:         %[[VAL_5:.*]] = arith.constant false
diff --git a/flang/test/Lower/module-and-internal-proc.f90 b/flang/test/Lower/module-and-internal-proc.f90
index 1da5ce42293974..0f4c6809581cfc 100644
--- a/flang/test/Lower/module-and-internal-proc.f90
+++ b/flang/test/Lower/module-and-internal-proc.f90
@@ -17,7 +17,7 @@ subroutine test1()
 subroutine test2()
   call test2internal()
   contains
-  ! CHECK-LABEL: func @_QMparentFtest2Ptest2internal()
+  ! CHECK-LABEL: func private @_QMparentFtest2Ptest2internal()
   subroutine test2internal()
     ! CHECK: fir.address_of(@_QMparentEi) : !fir.ref<i32>
     print *, i
@@ -31,7 +31,7 @@ subroutine test3()
   use parent
   call test3internal()
   contains
-  ! CHECK-LABEL: func @_QFtest3Ptest3internal()
+  ! CHECK-LABEL: func private @_QFtest3Ptest3internal()
   subroutine test3internal()
     ! CHECK: fir.address_of(@_QMparentEi) : !fir.ref<i32>
     print *, i
diff --git a/flang/test/Lower/parent-component.f90 b/flang/test/Lower/parent-component.f90
index ed1130a08493e2..c6bc53340643fc 100644
--- a/flang/test/Lower/parent-component.f90
+++ b/flang/test/Lower/parent-component.f90
@@ -29,20 +29,20 @@ subroutine print_scalar(a)
     type(p), intent(in) :: a
     print*, a
   end subroutine
-  ! CHECK-LABEL: func.func @_QFPprint_scalar(%{{.*}}: !fir.ref<!fir.type<_QFTp{a:i32}>> {fir.bindc_name = "a"})
+  ! CHECK-LABEL: func.func private @_QFPprint_scalar(%{{.*}}: !fir.ref<!fir.type<_QFTp{a:i32}>> {fir.bindc_name = "a"})
 
   subroutine print_p(a)
     type(p), intent(in) :: a(2)
     print*, a
   end subroutine
-  ! CHECK-LABEL: func.func @_QFPprint_p(%{{.*}}: !fir.ref<!fir.array<2x!fir.type<_QFTp{a:i32}>>> {fir.bindc_name = "a"})
+  ! CHECK-LABEL: func.func private @_QFPprint_p(%{{.*}}: !fir.ref<!fir.array<2x!fir.type<_QFTp{a:i32}>>> {fir.bindc_name = "a"})
 
   subroutine init_with_slice()
     type(c) :: y(2) = [ c(11, 21), c(12, 22) ]
     call print_p(y(:)%p)
     print*,y(:)%p
   end subroutine
-  ! CHECK-LABEL: func.func @_QFPinit_with_slice()
+  ! CHECK-LABEL: func.func private @_QFPinit_with_slice()
   ! CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_with_sliceEy) : !fir.ref<!fir.array<2x!fir.type<_QFTc{a:i32,b:i32}>>>
   ! CHECK: %[[C2:.*]] = arith.constant 2 : index
   ! CHECK: %[[C1:.*]] = arith.constant 1 : index
@@ -78,7 +78,7 @@ subroutine init_no_slice()
     call print_p(y%p)
     print*,y%p
   end subroutine
-  ! CHECK-LABEL: func.func @_QFPinit_no_slice()
+  ! CHECK-LABEL: func.func private @_QFPinit_no_slice()
   ! CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_no_sliceEy) : !fir.ref<!fir.array<2x!fir.type<_QFTc{a:i32,b:i32}>>>
   ! CHECK: %[[C2:.*]] = arith.constant 2 : index
   ! CHECK: %[[SHAPE:.*]] = fir.shape %[[C2]] : (index) -> !fir.shape<1>
@@ -106,7 +106,7 @@ subroutine init_allocatable()
     print*,y%p
   end subroutine
 
-  ! CHECK-LABEL: func.func @_QFPinit_allocatable()
+  ! CHECK-LABEL: func.func private @_QFPinit_allocatable()
   ! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.heap<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>> {uniq_name = "_QFFinit_allocatableEy.addr"}
   ! CHECK: %[[LB0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.lb0"}
   ! CHECK: %[[EXT0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.ext0"}
@@ -139,7 +139,7 @@ subroutine init_scalar()
     print*,s%p
   end subroutine
 
-  ! CHECK-LABEL: func.func @_QFPinit_scalar()
+  ! CHECK-LABEL: func.func private @_QFPinit_scalar()
   ! CHECK: %[[S:.*]] = fir.address_of(@_QFFinit_scalarEs) : !fir.ref<!fir.type<_QFTc{a:i32,b:i32}>>
   ! CHECK: %[[CAST:.*]] = fir.convert %[[S]] : (!fir.ref<!fir.type<_QFTc{a:i32,b:i32}>>) -> !fir.ref<!fir.type<_QFTp{a:i32}>>
   ! CHECK: fir.call @_QFPprint_scalar(%[[CAST]]) {{.*}}: (!fir.ref<!fir.type<_QFTp{a:i32}>>) -> ()
@@ -154,7 +154,7 @@ subroutine init_assumed(y)
     print*,y%p
   end subroutine
 
-  ! CHECK-LABEL: func.func @_QFPinit_assumed(
+  ! CHECK-LABEL: func.func private @_QFPinit_assumed(
   ! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>
   ! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]] : (!fir.box<!fir.array<?x!fir.type<_QFTc{a:i32,b:i32}>>>) -> !fir.box<!fir.array<?x!fir.type<_QFTp{a:i32}>>>
 
@@ -167,7 +167,7 @@ subroutine init_existing_field()
     call print_p(y%c%p)
   end subroutine
 
-  ! CHECK-LABEL: func.func @_QFPinit_existing_field
+  ! CHECK-LABEL: func.func private @_QFPinit_existing_field
   ! CHECK: %[[C2:.*]] = arith.constant 2 : index
   ! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<2x!fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}>> {bindc_name = "y", uniq_name = "_QFFinit_existing_fieldEy"}
   ! CHECK: %[[FIELD_C:.*]] = fir.field_index c, !fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}>
@@ -183,7 +183,7 @@ subroutine parent_comp_lhs()
     a%p = B
   end subroutine
 
-! CHECK-LABEL: func.func @_QFPparent_comp_lhs()
+! CHECK-LABEL: func.func private @_QFPparent_comp_lhs()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.type<_QFTp{a:i32}>>
 ! CHECK: %[[A:.*]] = fir.alloca !fir.type<_QFTc{a:i32,b:i32}> {bindc_name = "a", uniq_name = "_QFFparent_comp_lhsEa"}
 ! CHECK: %[[B:.*]] = fir.alloca !fir.type<_QFTp{a:i32}> {bindc_name = "b", uniq_name = "_QFFparent_comp_lhsEb"}
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index a813eff690b77e..15d8a86e4ef47c 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -519,8 +519,8 @@ subroutine internal
     end subroutine
   end subroutine
 
-! CHECK-LABEL: func.func @_QMpolymorphic_testFhost_assocPinternal(
-! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>> {fir.host_assoc}) attributes {fir.internal_proc} {
+! CHECK-LABEL: func.func private @_QMpolymorphic_testFhost_assocPinternal(
+! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage<internal>} {
 ! CHECK: %[[POS_IN_TUPLE:.*]] = arith.constant 0 : i32
 ! CHECK: %[[COORD_OF_CLASS:.*]] = fir.coordinate_of %[[TUPLE]], %[[POS_IN_TUPLE]] : (!fir.ref<tuple<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>>, i32) -> !fir.ref<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
 ! CHECK: %[[CLASS:.*]] = fir.load %[[COORD_OF_CLASS]] : !fir.ref<!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>>
diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90
index 36631979141a08..002343c45f6ec7 100644
--- a/flang/test/Lower/program-units-fir-mangling.f90
+++ b/flang/test/Lower/program-units-fir-mangling.f90
@@ -44,12 +44,12 @@ function foo()
 function foo2()
   real(4) :: foo2
 contains
-  ! CHECK-LABEL: func @_QFfoo2Psub() {
+  ! CHECK-LABEL: func private @_QFfoo2Psub() {{.*}} {
   subroutine sub()
   ! CHECK: }
   end subroutine
 
-  ! CHECK-LABEL: func @_QFfoo2Pfoo() {
+  ! CHECK-LABEL: func private @_QFfoo2Pfoo() {{.*}} {
   subroutine foo()
   ! CHECK: }
   end subroutine
@@ -58,12 +58,12 @@ subroutine foo()
 ! CHECK-LABEL: func @_QPsub2()
 subroutine sUb2()
 contains
-  ! CHECK-LABEL: func @_QFsub2Psub() {
+  ! CHECK-LABEL: func private @_QFsub2Psub() {{.*}} {
   subroutine sub()
   ! CHECK: }
   end subroutine
 
-  ! CHECK-LABEL: func @_QFsub2Pfoo() {
+  ! CHECK-LABEL: func private @_QFsub2Pfoo() {{.*}} {
   subroutine Foo()
   ! CHECK: }
   end subroutine
@@ -74,7 +74,7 @@ module testMod2
   ! CHECK-LABEL: func @_QMtestmod2Psub()
   subroutine sub()
   contains
-    ! CHECK-LABEL: func @_QMtestmod2FsubPsubsub() {
+    ! CHECK-LABEL: func private @_QMtestmod2FsubPsubsub() {{.*}} {
     subroutine subSub()
     ! CHECK: }
     end subroutine
@@ -105,7 +105,7 @@ subroutine sub
   ! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplPfoo()
   subroutine foo
     contains
-    ! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplFfooPbar() {
+    ! CHECK-LABEL: func private @_QMcolor_pointsScolor_points_aSimplFfooPbar() {{.*}} {
     subroutine bar
     ! CHECK: }
     end subroutine
@@ -128,7 +128,7 @@ subroutine should_not_collide()
 program test
 ! CHECK: }
 contains
-! CHECK-LABEL: func @_QFPshould_not_collide() {
+! CHECK-LABEL: func private @_QFPshould_not_collide() {{.*}} {
 subroutine should_not_collide()
 ! CHECK: }
 end subroutine
@@ -226,7 +226,7 @@ subroutine nest1
   ! CHECK:   fir.call @_QFnest1Pinner()
   call inner
 contains
-  ! CHECK-LABEL: func @_QFnest1Pinner
+  ! CHECK-LABEL: func private @_QFnest1Pinner
   subroutine inner
     ! CHECK:   %[[V_0:[0-9]+]] = fir.address_of(@_QFnest1FinnerEkk) : !fir.ref<i32>
     integer, save :: kk = 1
@@ -239,7 +239,7 @@ subroutine nest2
   ! CHECK:   fir.call @_QFnest2Pinner()
   call inner
 contains
-  ! CHECK-LABEL: func @_QFnest2Pinner
+  ! CHECK-LABEL: func private @_QFnest2Pinner
   subroutine inner
     ! CHECK:   %[[V_0:[0-9]+]] = fir.address_of(@_QFnest2FinnerEkk) : !fir.ref<i32>
     integer, save :: kk = 77
diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
index 0c7d399d4023bb..bb6a5631f2c3f6 100644
--- a/libc/benchmarks/automemcpy/lib/CMakeLists.txt
+++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt
@@ -18,7 +18,8 @@ add_custom_command(
 
 add_library(automemcpy_implementations "${Implementations}")
 target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark)
-target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
+target_include_directories(automemcpy_implementations PRIVATE 
+                           ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_AUTOMEMCPY_INCLUDE_DIR})
 target_compile_options(automemcpy_implementations PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin)
 llvm_update_compile_flags(automemcpy_implementations)
 
diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index c7ccd392354cb8..72b04822d8b84a 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -87,6 +87,7 @@ function(_get_common_compile_options output_var flags)
     list(APPEND compile_options "-fvisibility=hidden")
     list(APPEND compile_options "-fconvergent-functions")
     list(APPEND compile_options "-flto")
+    list(APPEND compile_options "-Wno-multi-gpu")
 
     if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       list(APPEND compile_options "-Wno-unknown-cuda-version")
diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake
index 0649e9f7a76709..5469799f023983 100644
--- a/libc/cmake/modules/LLVMLibCObjectRules.cmake
+++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake
@@ -59,6 +59,7 @@ function(create_object_library fq_target_name)
   )
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   target_compile_options(${fq_target_name} PRIVATE ${compile_options})
 
   # The NVPTX target is installed as LLVM-IR but the internal testing toolchain
@@ -73,6 +74,7 @@ function(create_object_library fq_target_name)
     )
     target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
     target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+    target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
     target_compile_options(${internal_target_name} PRIVATE ${compile_options}
                            -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE})
   endif()
@@ -279,6 +281,7 @@ function(create_entrypoint_object fq_target_name)
   target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options})
   target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   add_dependencies(${internal_target_name} ${full_deps_list})
   target_link_libraries(${internal_target_name} ${full_deps_list})
 
@@ -300,6 +303,7 @@ function(create_entrypoint_object fq_target_name)
   target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING)
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   add_dependencies(${fq_target_name} ${full_deps_list})
   target_link_libraries(${fq_target_name} ${full_deps_list})
 
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 76ce6754bd7339..5981d427b71f8d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -184,6 +184,7 @@ function(create_libc_unittest fq_target_name)
   )
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
   if(NOT LIBC_UNITTEST_CXX_STANDARD)
@@ -317,6 +318,7 @@ function(add_libc_fuzzer target_name)
   )
   target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
 
   target_link_libraries(${fq_target_name} PRIVATE
     ${link_object_files}
@@ -457,13 +459,14 @@ function(add_integration_test test_name)
       PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
 
   _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
@@ -471,7 +474,7 @@ function(add_integration_test test_name)
     # We need to use the internal object versions for NVPTX.
     set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
@@ -632,6 +635,7 @@ function(add_libc_hermetic_test test_name)
   _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
   target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR})
   target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR})
+  target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include)
   _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}")
   target_compile_options(${fq_build_target_name} PRIVATE ${compile_options})
 
@@ -647,14 +651,14 @@ function(add_libc_hermetic_test test_name)
   if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
     target_link_options(${fq_build_target_name} PRIVATE
       ${LIBC_COMPILE_OPTIONS_DEFAULT}
-      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+      -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
       "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
       "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
   elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
     # We need to use the internal object versions for NVPTX.
     set(internal_suffix ".__internal__")
     target_link_options(${fq_build_target_name} PRIVATE
-      ${LIBC_COMPILE_OPTIONS_DEFAULT}
+      ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
       "-Wl,--suppress-stack-size-warning"
       -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
       "--cuda-path=${LIBC_CUDA_ROOT}")
diff --git a/libc/cmake/modules/compiler_features/check_fixed_point.cpp b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
index a5192697d43f77..9199340fe652ea 100644
--- a/libc/cmake/modules/compiler_features/check_fixed_point.cpp
+++ b/libc/cmake/modules/compiler_features/check_fixed_point.cpp
@@ -1,4 +1,4 @@
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 #ifndef LIBC_COMPILER_HAS_FIXED_POINT
 #error unsupported
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index c2300a2aa681a4..27c9a42934c240 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -142,6 +142,11 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_count_ones_ui
     libc.src.stdbit.stdc_count_ones_ul
     libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst
index 0cb25c7f2a2336..6e73a305e8e054 100644
--- a/libc/docs/dev/undefined_behavior.rst
+++ b/libc/docs/dev/undefined_behavior.rst
@@ -20,6 +20,7 @@ guidelines and the resulting code should behave predictably even in unexpected
 situations.
 
 #. Follow the standards.
+    #. If there is no standard, first ask yourself if this implementation is necessary (are there users who need this functionality?). If it truly is, then match existing implementations. Creating competing designs just causes confusion (see the history of qsort_r).
 #. Avoid giving an incorrect answer.
     #. In general, correct answer > correct answer (wrong format) > no answer > crash the program >>>>>>> incorrect answer.
     #. The C library is called frequently in performance critical situations, and so can't afford to do thorough error checking and correction.
@@ -61,7 +62,7 @@ Often the standard will imply an intended behavior through what it states is und
 
 Ignoring Bug-For-Bug Compatibility
 ----------------------------------
-Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations. There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library.
+Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations. There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library. If there is no standard, then matching another implementation (even bug-for-bug) may be necessary, but before you implement an unstandardized function first consider if anyone will actually use it at all.
 
 Design Decisions
 ================
diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst
index 0308caeb929321..b579e9dbbc2f51 100644
--- a/libc/docs/stdbit.rst
+++ b/libc/docs/stdbit.rst
@@ -81,11 +81,11 @@ stdc_count_ones_us           |check|
 stdc_count_ones_ui           |check|
 stdc_count_ones_ul           |check|
 stdc_count_ones_ull          |check|
-stdc_has_single_bit_uc
-stdc_has_single_bit_us
-stdc_has_single_bit_ui
-stdc_has_single_bit_ul
-stdc_has_single_bit_ull
+stdc_has_single_bit_uc       |check|
+stdc_has_single_bit_us       |check|
+stdc_has_single_bit_ui       |check|
+stdc_has_single_bit_ul       |check|
+stdc_has_single_bit_ull      |check|
 stdc_bit_width_uc
 stdc_bit_width_us
 stdc_bit_width_ui
@@ -124,7 +124,7 @@ stdc_first_trailing_zero   |check|
 stdc_first_trailing_one    |check|
 stdc_count_zeros           |check|
 stdc_count_ones            |check|
-stdc_has_single_bit
+stdc_has_single_bit        |check|
 stdc_bit_width
 stdc_bit_floor
 stdc_bit_ceil
diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
index b4a8621891203d..c385c3a8f3e44a 100644
--- a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
+++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp
@@ -11,7 +11,7 @@
 //===----------------------------------------------------------------------===//
 #include "src/stdio/snprintf.h"
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/fixed_point/fx_rep.h"
 
diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h
index 5ee152e105f772..e3a36d10ed92ab 100644
--- a/libc/include/llvm-libc-macros/stdbit-macros.h
+++ b/libc/include/llvm-libc-macros/stdbit-macros.h
@@ -157,6 +157,21 @@ inline unsigned stdc_count_ones(unsigned long x) {
 inline unsigned stdc_count_ones(unsigned long long x) {
   return stdc_count_ones_ull(x);
 }
+inline bool stdc_has_single_bit(unsigned char x) {
+  return stdc_has_single_bit_uc(x);
+}
+inline bool stdc_has_single_bit(unsigned short x) {
+  return stdc_has_single_bit_us(x);
+}
+inline bool stdc_has_single_bit(unsigned x) {
+  return stdc_has_single_bit_ui(x);
+}
+inline bool stdc_has_single_bit(unsigned long x) {
+  return stdc_has_single_bit_ul(x);
+}
+inline bool stdc_has_single_bit(unsigned long long x) {
+  return stdc_has_single_bit_ull(x);
+}
 #else
 #define stdc_leading_zeros(x)                                                  \
   _Generic((x),                                                                \
@@ -228,6 +243,13 @@ inline unsigned stdc_count_ones(unsigned long long x) {
       unsigned: stdc_count_ones_ui,                                            \
       unsigned long: stdc_count_ones_ul,                                       \
       unsigned long long: stdc_count_ones_ull)(x)
+#define stdc_has_single_bit(x)                                                 \
+  _Generic((x),                                                                \
+      unsigned char: stdc_has_single_bit_uc,                                   \
+      unsigned short: stdc_has_single_bit_us,                                  \
+      unsigned: stdc_has_single_bit_ui,                                        \
+      unsigned long: stdc_has_single_bit_ul,                                   \
+      unsigned long long: stdc_has_single_bit_ull)(x)
 #endif // __cplusplus
 
 #endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H
diff --git a/libc/include/llvm-libc-types/float128.h b/libc/include/llvm-libc-types/float128.h
index 61a094fdb96b12..1907a5e3ece727 100644
--- a/libc/include/llvm-libc-types/float128.h
+++ b/libc/include/llvm-libc-types/float128.h
@@ -9,7 +9,7 @@
 #ifndef __LLVM_LIBC_TYPES_FLOAT128_H__
 #define __LLVM_LIBC_TYPES_FLOAT128_H__
 
-#include <include/llvm-libc-macros/float-macros.h> // LDBL_MANT_DIG
+#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
 
 // Currently, C23 `_Float128` type is only defined as a built-in type in GCC 7
 // or later, and only for C.  For C++, or for clang, `__float128` is defined
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 90c076580be125..998f37fb26deed 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -51,6 +51,7 @@ def LongDoubleType : NamedType<"long double">;
 def CharType : NamedType<"char">;
 def UnsignedCharType : NamedType<"unsigned char">;
 def UnsignedShortType : NamedType<"unsigned short">;
+def BoolType : NamedType<"bool">;
 
 def Float128Type : NamedType<"float128">;
 
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 8a1a235e4eecdc..5b97255b899746 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -799,7 +799,8 @@ def StdC : StandardSpec<"stdc"> {
         Macro<"stdc_first_trailing_zero">,
         Macro<"stdc_first_trailing_one">,
         Macro<"stdc_count_zeros">,
-        Macro<"stdc_count_ones">
+        Macro<"stdc_count_ones">,
+        Macro<"stdc_has_single_bit">
       ], // Macros
       [], // Types
       [], // Enumerations
@@ -848,7 +849,12 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"stdc_count_ones_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
           FunctionSpec<"stdc_count_ones_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
           FunctionSpec<"stdc_count_ones_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
-          FunctionSpec<"stdc_count_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>
+          FunctionSpec<"stdc_count_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+          FunctionSpec<"stdc_has_single_bit_uc", RetValSpec<BoolType>, [ArgSpec<UnsignedCharType>]>,
+          FunctionSpec<"stdc_has_single_bit_us", RetValSpec<BoolType>, [ArgSpec<UnsignedShortType>]>,
+          FunctionSpec<"stdc_has_single_bit_ui", RetValSpec<BoolType>, [ArgSpec<UnsignedIntType>]>,
+          FunctionSpec<"stdc_has_single_bit_ul", RetValSpec<BoolType>, [ArgSpec<UnsignedLongType>]>,
+          FunctionSpec<"stdc_has_single_bit_ull", RetValSpec<BoolType>, [ArgSpec<UnsignedLongLongType>]>
       ] // Functions
   >;
 
diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h
index 1ffde5f9556f87..6440e8cb358fa7 100644
--- a/libc/src/__support/CPP/limits.h
+++ b/libc/src/__support/CPP/limits.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
 #define LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H
 
-#include "include/llvm-libc-macros/limits-macros.h" // CHAR_BIT
+#include "llvm-libc-macros/limits-macros.h" // CHAR_BIT
 #include "src/__support/CPP/type_traits/is_integral.h"
 #include "src/__support/CPP/type_traits/is_signed.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
diff --git a/libc/src/__support/CPP/type_traits/is_fixed_point.h b/libc/src/__support/CPP/type_traits/is_fixed_point.h
index 317ba39748b7de..e139e6477e2e7e 100644
--- a/libc/src/__support/CPP/type_traits/is_fixed_point.h
+++ b/libc/src/__support/CPP/type_traits/is_fixed_point.h
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/type_traits/remove_cv.h"
 #include "src/__support/macros/attributes.h"
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE::cpp {
 
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 5b4697e5245b6d..e2a26d0e2b5f36 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H
 #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H
 
-#include "include/llvm-libc-types/ENTRY.h"
+#include "llvm-libc-types/ENTRY.h"
 #include "src/__support/CPP/bit.h" // bit_ceil
 #include "src/__support/CPP/new.h"
 #include "src/__support/HashTable/bitmask.h"
diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h
index fcd47cd72cbb31..0c8d03beb84ae5 100644
--- a/libc/src/__support/fixed_point/fx_bits.h
+++ b/libc/src/__support/fixed_point/fx_bits.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h
index fcd7554c4d8550..7d18f14a8c483a 100644
--- a/libc/src/__support/fixed_point/fx_rep.h
+++ b/libc/src/__support/fixed_point/fx_rep.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
 
diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h
index d8df294b18a1a8..236ebb2857030b 100644
--- a/libc/src/__support/fixed_point/sqrt.h
+++ b/libc/src/__support/fixed_point/sqrt.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
 #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h
index 08a1ab726cbde4..510f3923749358 100644
--- a/libc/src/__support/macros/properties/float.h
+++ b/libc/src/__support/macros/properties/float.h
@@ -11,8 +11,8 @@
 #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H
 #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H
 
-#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
-#include "include/llvm-libc-types/float128.h"      // float128
+#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG
+#include "llvm-libc-types/float128.h"      // float128
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/compiler.h"
 #include "src/__support/macros/properties/cpu_features.h"
diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt
index 5fb77d21e57a13..8bc7dd7852bbca 100644
--- a/libc/src/stdbit/CMakeLists.txt
+++ b/libc/src/stdbit/CMakeLists.txt
@@ -9,6 +9,7 @@ set(prefixes
   first_trailing_one
   count_zeros
   count_ones
+  has_single_bit
 )
 set(suffixes c s i l ll)
 foreach(prefix IN LISTS prefixes)
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.cpp b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
new file mode 100644
index 00000000000000..e5acdc2a71b4bc
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_uc --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_uc, (unsigned char value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.h b/libc/src/stdbit/stdc_has_single_bit_uc.h
new file mode 100644
index 00000000000000..028d4ee710505a
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_uc --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_uc(unsigned char value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.cpp b/libc/src/stdbit/stdc_has_single_bit_ui.cpp
new file mode 100644
index 00000000000000..37578882324aa6
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ui.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ui --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ui.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ui, (unsigned value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.h b/libc/src/stdbit/stdc_has_single_bit_ui.h
new file mode 100644
index 00000000000000..1e8cd9afaee885
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ui.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ui --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ui(unsigned value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.cpp b/libc/src/stdbit/stdc_has_single_bit_ul.cpp
new file mode 100644
index 00000000000000..85133ab81cc602
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ul.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ul --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ul.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ul, (unsigned long value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.h b/libc/src/stdbit/stdc_has_single_bit_ul.h
new file mode 100644
index 00000000000000..9b924fca9f065d
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ul.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ul --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ul(unsigned long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.cpp b/libc/src/stdbit/stdc_has_single_bit_ull.cpp
new file mode 100644
index 00000000000000..4491cf2b98b6d3
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ull.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_ull -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_ull.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ull, (unsigned long long value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.h b/libc/src/stdbit/stdc_has_single_bit_ull.h
new file mode 100644
index 00000000000000..d4802bc287274f
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_ull.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_ull -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_ull(unsigned long long value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H
diff --git a/libc/src/stdbit/stdc_has_single_bit_us.cpp b/libc/src/stdbit/stdc_has_single_bit_us.cpp
new file mode 100644
index 00000000000000..7a42ae553aa2e8
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_us.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_us --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_us.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_us, (unsigned short value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_us.h b/libc/src/stdbit/stdc_has_single_bit_us.h
new file mode 100644
index 00000000000000..201ff4954c3b7a
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_us.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_us --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
+#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
+
+namespace LIBC_NAMESPACE {
+
+bool stdc_has_single_bit_us(unsigned short value);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H
diff --git a/libc/src/stdfix/abshk.h b/libc/src/stdfix/abshk.h
index 13c9300caab883..80dc73053dfb45 100644
--- a/libc/src/stdfix/abshk.h
+++ b/libc/src/stdfix/abshk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSHK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSHK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abshr.h b/libc/src/stdfix/abshr.h
index 5acd0cfc4a60db..035f9a6de222e8 100644
--- a/libc/src/stdfix/abshr.h
+++ b/libc/src/stdfix/abshr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSHR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSHR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/absk.h b/libc/src/stdfix/absk.h
index 73dfcac0ac8e7f..426415de28e6ae 100644
--- a/libc/src/stdfix/absk.h
+++ b/libc/src/stdfix/absk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abslk.h b/libc/src/stdfix/abslk.h
index 7de116fa227932..21e33f856bfc65 100644
--- a/libc/src/stdfix/abslk.h
+++ b/libc/src/stdfix/abslk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSLK_H
 #define LLVM_LIBC_SRC_STDFIX_ABSLK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/abslr.h b/libc/src/stdfix/abslr.h
index bf5b585bbbb669..ebca35e58aa510 100644
--- a/libc/src/stdfix/abslr.h
+++ b/libc/src/stdfix/abslr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSLR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSLR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/absr.h b/libc/src/stdfix/absr.h
index b5ead7ce14e2a0..2744fcb5a7eccc 100644
--- a/libc/src/stdfix/absr.h
+++ b/libc/src/stdfix/absr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ABSR_H
 #define LLVM_LIBC_SRC_STDFIX_ABSR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundhk.h b/libc/src/stdfix/roundhk.h
index 9a5c874cc030db..06de5cc05cdbe4 100644
--- a/libc/src/stdfix/roundhk.h
+++ b/libc/src/stdfix/roundhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDHK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundhr.h b/libc/src/stdfix/roundhr.h
index ba5a67945d6c3b..6729bf5b139973 100644
--- a/libc/src/stdfix/roundhr.h
+++ b/libc/src/stdfix/roundhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDHR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundk.h b/libc/src/stdfix/roundk.h
index e9fa6d8f9c3b8c..02fb9a8c9b1a86 100644
--- a/libc/src/stdfix/roundk.h
+++ b/libc/src/stdfix/roundk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundlk.h b/libc/src/stdfix/roundlk.h
index 5fa0e90e855a64..28be9c00549498 100644
--- a/libc/src/stdfix/roundlk.h
+++ b/libc/src/stdfix/roundlk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDLK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundlr.h b/libc/src/stdfix/roundlr.h
index c015292e8f3f28..be97a35a64204d 100644
--- a/libc/src/stdfix/roundlr.h
+++ b/libc/src/stdfix/roundlr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDLR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundr.h b/libc/src/stdfix/roundr.h
index b5b1375c882e03..15523f8b6c9a38 100644
--- a/libc/src/stdfix/roundr.h
+++ b/libc/src/stdfix/roundr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduhk.h b/libc/src/stdfix/rounduhk.h
index 85ebf2903ec7e9..d1c4a4416d7636 100644
--- a/libc/src/stdfix/rounduhk.h
+++ b/libc/src/stdfix/rounduhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduhr.h b/libc/src/stdfix/rounduhr.h
index 1be0aab1f5a79d..6cecb733dd3b18 100644
--- a/libc/src/stdfix/rounduhr.h
+++ b/libc/src/stdfix/rounduhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/rounduk.h b/libc/src/stdfix/rounduk.h
index 8dae89586c4901..4511d69525c5d5 100644
--- a/libc/src/stdfix/rounduk.h
+++ b/libc/src/stdfix/rounduk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundulk.h b/libc/src/stdfix/roundulk.h
index 81dfd1dceb6001..8bd90beeb830c5 100644
--- a/libc/src/stdfix/roundulk.h
+++ b/libc/src/stdfix/roundulk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDULK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundulr.h b/libc/src/stdfix/roundulr.h
index 002fc94907c613..65e5c27b1c8531 100644
--- a/libc/src/stdfix/roundulr.h
+++ b/libc/src/stdfix/roundulr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDULR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/roundur.h b/libc/src/stdfix/roundur.h
index 72de44b1e0c4e5..110e578da79319 100644
--- a/libc/src/stdfix/roundur.h
+++ b/libc/src/stdfix/roundur.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
 #define LLVM_LIBC_SRC_STDFIX_ROUNDUR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h
index 80000a0079696d..b57340003fa03c 100644
--- a/libc/src/stdfix/sqrtuhk.h
+++ b/libc/src/stdfix/sqrtuhk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h
index fd95f0924e8d48..6b629a29de3c88 100644
--- a/libc/src/stdfix/sqrtuhr.h
+++ b/libc/src/stdfix/sqrtuhr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h
index 04d0adadde9ad2..6bd7a2608716cd 100644
--- a/libc/src/stdfix/sqrtuk.h
+++ b/libc/src/stdfix/sqrtuk.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUK_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h
index 284adaaf35bf59..d1982a6b1c0518 100644
--- a/libc/src/stdfix/sqrtulr.h
+++ b/libc/src/stdfix/sqrtulr.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTULR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h
index df9dfe5a0bf39e..13f7d1e5e466ec 100644
--- a/libc/src/stdfix/sqrtur.h
+++ b/libc/src/stdfix/sqrtur.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H
 #define LLVM_LIBC_SRC_STDFIX_SQRTUR_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 namespace LIBC_NAMESPACE {
 
diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h
index de69c603be6b63..c89971e20686e4 100644
--- a/libc/src/stdio/printf_core/fixed_converter.h
+++ b/libc/src/stdio/printf_core/fixed_converter.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/fixed_point/fx_rep.h"
diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h
index 0876116a0bac86..13fdbf243a22e8 100644
--- a/libc/src/stdio/printf_core/parser.h
+++ b/libc/src/stdio/printf_core/parser.h
@@ -9,7 +9,7 @@
 #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
 #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/optional.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/str_to_integer.h"
diff --git a/libc/src/sys/epoll/epoll_pwait.h b/libc/src/sys/epoll/epoll_pwait.h
index 9dcb55533009f9..16105850d6942e 100644
--- a/libc/src/sys/epoll/epoll_pwait.h
+++ b/libc/src/sys/epoll/epoll_pwait.h
@@ -10,8 +10,8 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/epoll_pwait2.h b/libc/src/sys/epoll/epoll_pwait2.h
index 622ede6a0f9f9a..f7b28d4fbc51d9 100644
--- a/libc/src/sys/epoll/epoll_pwait2.h
+++ b/libc/src/sys/epoll/epoll_pwait2.h
@@ -10,9 +10,9 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT2_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
-// #include "include/llvm-libc-types/struct_timespec.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_timespec.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/epoll_wait.h b/libc/src/sys/epoll/epoll_wait.h
index d51c9100846ce0..0dc487bba5bdf4 100644
--- a/libc/src/sys/epoll/epoll_wait.h
+++ b/libc/src/sys/epoll/epoll_wait.h
@@ -10,7 +10,7 @@
 #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_WAIT_H
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp
index ee1b4e66e98444..e0c13a7a79602f 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp
@@ -15,8 +15,8 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
index 671dede2a1058d..a44b0c2a9f70f0 100644
--- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp
+++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp
@@ -15,9 +15,9 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
-// #include "include/llvm-libc-types/struct_timespec.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/struct_timespec.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp
index 0c43edf7645454..b643e2dd720cb6 100644
--- a/libc/src/sys/epoll/linux/epoll_wait.cpp
+++ b/libc/src/sys/epoll/linux/epoll_wait.cpp
@@ -14,8 +14,8 @@
 #include <sys/syscall.h> // For syscall numbers.
 
 // TODO: Use this include once the include headers are also using quotes.
-// #include "include/llvm-libc-types/sigset_t.h"
-// #include "include/llvm-libc-types/struct_epoll_event.h"
+// #include "llvm-libc-types/sigset_t.h"
+// #include "llvm-libc-types/struct_epoll_event.h"
 
 #include <sys/epoll.h>
 
diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
index 4668f0061975f8..466494f038f4e3 100644
--- a/libc/test/UnitTest/CMakeLists.txt
+++ b/libc/test/UnitTest/CMakeLists.txt
@@ -26,7 +26,8 @@ function(add_unittest_framework_library name)
       ${TEST_LIB_SRCS}
       ${TEST_LIB_HDRS}
     )
-    target_include_directories(${lib} PUBLIC ${LIBC_SOURCE_DIR})
+    target_include_directories(${lib} PUBLIC 
+                               ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
     list(APPEND compile_options -fno-exceptions -fno-rtti)
     if(TARGET libc.src.time.clock)
       target_compile_definitions(${lib} PRIVATE TARGET_SUPPORTS_CLOCK)
diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp
index 7b0e4fca83683b..babd44f9b20630 100644
--- a/libc/test/UnitTest/LibcTest.cpp
+++ b/libc/test/UnitTest/LibcTest.cpp
@@ -8,7 +8,7 @@
 
 #include "LibcTest.h"
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 #include "src/__support/CPP/string.h"
 #include "src/__support/CPP/string_view.h"
 #include "src/__support/UInt128.h"
diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp
index 46019075a7c109..84a4cde18b9f40 100644
--- a/libc/test/include/stdbit_test.cpp
+++ b/libc/test/include/stdbit_test.cpp
@@ -81,9 +81,14 @@ unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; }
 unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; }
 unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; }
 unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3FU; }
+bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; }
+bool stdc_has_single_bit_us(unsigned short) noexcept { return false; }
+bool stdc_has_single_bit_ui(unsigned) noexcept { return false; }
+bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; }
+bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; }
 }
 
-#include "include/llvm-libc-macros/stdbit-macros.h"
+#include "llvm-libc-macros/stdbit-macros.h"
 
 TEST(LlvmLibcStdbitTest, TypeGenericMacroLeadingZeros) {
   EXPECT_EQ(stdc_leading_zeros(static_cast<unsigned char>(0U)), 0xAAU);
@@ -164,3 +169,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroCountOnes) {
   EXPECT_EQ(stdc_count_ones(0UL), 0x3DU);
   EXPECT_EQ(stdc_count_ones(0ULL), 0x3FU);
 }
+
+TEST(LlvmLibcStdbitTest, TypeGenericMacroHasSingleBit) {
+  EXPECT_EQ(stdc_has_single_bit(static_cast<unsigned char>(1U)), false);
+  EXPECT_EQ(stdc_has_single_bit(static_cast<unsigned short>(1U)), false);
+  EXPECT_EQ(stdc_has_single_bit(1U), false);
+  EXPECT_EQ(stdc_has_single_bit(1UL), false);
+  EXPECT_EQ(stdc_has_single_bit(1ULL), false);
+}
diff --git a/libc/test/include/stdckdint_test.cpp b/libc/test/include/stdckdint_test.cpp
index 1180a6de9efe2e..5ac8c95f4ef26f 100644
--- a/libc/test/include/stdckdint_test.cpp
+++ b/libc/test/include/stdckdint_test.cpp
@@ -8,7 +8,7 @@
 
 #include "test/UnitTest/Test.h"
 
-#include "include/llvm-libc-macros/stdckdint-macros.h"
+#include "llvm-libc-macros/stdckdint-macros.h"
 
 TEST(LlvmLibcStdCkdIntTest, Add) {
   int result;
diff --git a/libc/test/integration/startup/CMakeLists.txt b/libc/test/integration/startup/CMakeLists.txt
index fb5d6bc787cc26..08c0d978602b80 100644
--- a/libc/test/integration/startup/CMakeLists.txt
+++ b/libc/test/integration/startup/CMakeLists.txt
@@ -31,6 +31,7 @@ function(add_startup_test target_name)
     ${fq_target_name}
     PRIVATE
       ${LIBC_SOURCE_DIR}
+      ${LIBC_SOURCE_DIR}/include
       ${LIBC_BUILD_DIR}
       ${LIBC_BUILD_DIR}/include
   )
diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 674e2cc1ed7499..7bbd7085fc2f45 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
index 09a4ae67256e3a..9401f822904d04 100644
--- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/integer_to_string.h"
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 4032d890c53ec8..bb36b6cedb63cb 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "test/IntegrationTest/test.h"
diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
index 58627816eb8d97..5670687273d5ba 100644
--- a/libc/test/src/__support/fixed_point/fx_bits_test.cpp
+++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "include/llvm-libc-macros/stdfix-macros.h"
+#include "llvm-libc-macros/stdfix-macros.h"
 
 #include "src/__support/fixed_point/fx_bits.h"
 #include "src/__support/integer_literals.h"
diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt
index 878f81f1d573c8..36bfdca1a442d6 100644
--- a/libc/test/src/math/differential_testing/CMakeLists.txt
+++ b/libc/test/src/math/differential_testing/CMakeLists.txt
@@ -47,6 +47,7 @@ function(add_diff_binary target_name)
     ${fq_target_name}
     PRIVATE
       ${LIBC_SOURCE_DIR}
+      ${LIBC_SOURCE_DIR}/include
   )
   if(DIFF_COMPILE_OPTIONS)
     target_compile_options(
diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt
index 659e575fedea27..a886ee4a35325d 100644
--- a/libc/test/src/stdbit/CMakeLists.txt
+++ b/libc/test/src/stdbit/CMakeLists.txt
@@ -11,6 +11,7 @@ set(prefixes
   first_trailing_one
   count_zeros
   count_ones
+  has_single_bit
 )
 set(suffixes c s i l ll)
 foreach(prefix IN LISTS prefixes)
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp
new file mode 100644
index 00000000000000..6212b1ec765a5d
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_uc ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUcTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(0U), false);
+}
+
+TEST(LlvmLibcStdcCountOnesUcTest, OneHot) {
+  for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
new file mode 100644
index 00000000000000..2e00507aa0258c
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ui ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUiTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(0U), false);
+}
+
+TEST(LlvmLibcStdcCountOnesUiTest, OneHot) {
+  for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
new file mode 100644
index 00000000000000..8c0178998bbec1
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ul ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUlTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(0U), false);
+}
+
+TEST(LlvmLibcStdcCountOnesUlTest, OneHot) {
+  for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(1UL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
new file mode 100644
index 00000000000000..1d9f976b6d6338
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ull -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUllTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(0U), false);
+}
+
+TEST(LlvmLibcStdcCountOnesUllTest, OneHot) {
+  for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(1ULL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
new file mode 100644
index 00000000000000..52c4de88104459
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_us ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcCountOnesUsTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(0U), false);
+}
+
+TEST(LlvmLibcStdcCountOnesUsTest, OneHot) {
+  for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(1U << i), true);
+}
diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt
index dca6a7bb830655..60208ed790d574 100644
--- a/libc/utils/LibcTableGenUtil/CMakeLists.txt
+++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt
@@ -5,5 +5,5 @@ add_llvm_library(
   DISABLE_LLVM_LINK_LLVM_DYLIB
   LINK_COMPONENTS Support TableGen
 )
-target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR})
+target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include)
 target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR})
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index e2aabb08c11dac..d74d65e8993829 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -11,7 +11,7 @@
 
 #include "utils/gpu/server/llvmlibc_rpc_server.h"
 
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
 
 #include <cstddef>
 #include <cstdint>
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 6c8f8d17af9b1f..78c6bb87a5a402 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -73,6 +73,11 @@ Deprecations and Removals
 
 - The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19.
 
+- The ``_LIBCPP_ATOMIC_ONLY_USE_BUILTINS`` configuration option has been removed in LLVM 19. This should not affect
+  many users, except perhaps users using the library with ``-ffreestanding`` with a toolchain where compiler-rt or
+  libatomic is not available. If you are one such user, please reach out to the libc++ developers so we can collaborate
+  on a path for supporting atomics properly on freestanding platforms.
+
 
 Upcoming Deprecations and Removals
 ----------------------------------
diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h
index 0fa289de54b0f1..db34f5ec02d744 100644
--- a/libcxx/include/__atomic/aliases.h
+++ b/libcxx/include/__atomic/aliases.h
@@ -92,7 +92,7 @@ using __largest_lock_free_type = short;
 #  elif ATOMIC_CHAR_LOCK_FREE == 2
 using __largest_lock_free_type = char;
 #  else
-#    define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen in freestanding)
+#    define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms)
 #  endif
 
 #  ifndef _LIBCPP_NO_LOCK_FREE_TYPES
diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h
index 1a0b808a0cb1c4..b900cc135f78f6 100644
--- a/libcxx/include/__atomic/cxx_atomic_impl.h
+++ b/libcxx/include/__atomic/cxx_atomic_impl.h
@@ -9,16 +9,13 @@
 #ifndef _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
 #define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H
 
-#include <__atomic/is_always_lock_free.h>
 #include <__atomic/memory_order.h>
 #include <__config>
 #include <__memory/addressof.h>
-#include <__type_traits/conditional.h>
 #include <__type_traits/is_assignable.h>
 #include <__type_traits/is_trivially_copyable.h>
 #include <__type_traits/remove_const.h>
 #include <cstddef>
-#include <cstring>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -26,7 +23,7 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) || defined(_LIBCPP_ATOMIC_ONLY_USE_BUILTINS)
+#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP)
 
 // [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because
 // the default operator= in an object is not volatile, a byte-by-byte copy
@@ -44,10 +41,6 @@ _LIBCPP_HIDE_FROM_ABI void __cxx_atomic_assign_volatile(_Tp volatile& __a_value,
     *__to++ = *__from++;
 }
 
-#endif
-
-#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP)
-
 template <typename _Tp>
 struct __cxx_atomic_base_impl {
   _LIBCPP_HIDE_FROM_ABI
@@ -529,289 +522,7 @@ __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_o
 
 #endif // _LIBCPP_HAS_GCC_ATOMIC_IMP, _LIBCPP_HAS_C_ATOMIC_IMP
 
-#ifdef _LIBCPP_ATOMIC_ONLY_USE_BUILTINS
-
-template <typename _Tp>
-struct __cxx_atomic_lock_impl {
-  _LIBCPP_HIDE_FROM_ABI __cxx_atomic_lock_impl() _NOEXCEPT : __a_value(), __a_lock(0) {}
-  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_lock_impl(_Tp value) _NOEXCEPT
-      : __a_value(value),
-        __a_lock(0) {}
-
-  _Tp __a_value;
-  mutable __cxx_atomic_base_impl<_LIBCPP_ATOMIC_FLAG_TYPE> __a_lock;
-
-  _LIBCPP_HIDE_FROM_ABI void __lock() const volatile {
-    while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire))
-      /*spin*/;
-  }
-  _LIBCPP_HIDE_FROM_ABI void __lock() const {
-    while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire))
-      /*spin*/;
-  }
-  _LIBCPP_HIDE_FROM_ABI void __unlock() const volatile {
-    __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release);
-  }
-  _LIBCPP_HIDE_FROM_ABI void __unlock() const {
-    __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release);
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp __read() const volatile {
-    __lock();
-    _Tp __old;
-    __cxx_atomic_assign_volatile(__old, __a_value);
-    __unlock();
-    return __old;
-  }
-  _LIBCPP_HIDE_FROM_ABI _Tp __read() const {
-    __lock();
-    _Tp __old = __a_value;
-    __unlock();
-    return __old;
-  }
-  _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const volatile {
-    __lock();
-    __cxx_atomic_assign_volatile(*__dst, __a_value);
-    __unlock();
-  }
-  _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const {
-    __lock();
-    *__dst = __a_value;
-    __unlock();
-  }
-};
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) {
-  __cxx_atomic_assign_volatile(__a->__a_value, __val);
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) {
-  __a->__a_value = __val;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) {
-  __a->__lock();
-  __cxx_atomic_assign_volatile(__a->__a_value, __val);
-  __a->__unlock();
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) {
-  __a->__lock();
-  __a->__a_value = __val;
-  __a->__unlock();
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, memory_order) {
-  return __a->__read();
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, memory_order) {
-  return __a->__read();
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void
-__cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) {
-  __a->__read_inplace(__dst);
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) {
-  __a->__read_inplace(__dst);
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, __value);
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) {
-  __a->__lock();
-  _Tp __old      = __a->__a_value;
-  __a->__a_value = __value;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong(
-    volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
-  _Tp __temp;
-  __a->__lock();
-  __cxx_atomic_assign_volatile(__temp, __a->__a_value);
-  bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0);
-  if (__ret)
-    __cxx_atomic_assign_volatile(__a->__a_value, __value);
-  else
-    __cxx_atomic_assign_volatile(*__expected, __a->__a_value);
-  __a->__unlock();
-  return __ret;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong(
-    __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
-  __a->__lock();
-  bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0);
-  if (__ret)
-    std::memcpy(&__a->__a_value, &__value, sizeof(_Tp));
-  else
-    std::memcpy(__expected, &__a->__a_value, sizeof(_Tp));
-  __a->__unlock();
-  return __ret;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak(
-    volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
-  _Tp __temp;
-  __a->__lock();
-  __cxx_atomic_assign_volatile(__temp, __a->__a_value);
-  bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0);
-  if (__ret)
-    __cxx_atomic_assign_volatile(__a->__a_value, __value);
-  else
-    __cxx_atomic_assign_volatile(*__expected, __a->__a_value);
-  __a->__unlock();
-  return __ret;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak(
-    __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) {
-  __a->__lock();
-  bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0);
-  if (__ret)
-    std::memcpy(&__a->__a_value, &__value, sizeof(_Tp));
-  else
-    std::memcpy(__expected, &__a->__a_value, sizeof(_Tp));
-  __a->__unlock();
-  return __ret;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta));
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
-  __a->__lock();
-  _Tp __old = __a->__a_value;
-  __a->__a_value += __delta;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp*
-__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) {
-  __a->__lock();
-  _Tp* __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta);
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) {
-  __a->__lock();
-  _Tp* __old = __a->__a_value;
-  __a->__a_value += __delta;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta));
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp, typename _Td>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) {
-  __a->__lock();
-  _Tp __old = __a->__a_value;
-  __a->__a_value -= __delta;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern));
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old = __a->__a_value;
-  __a->__a_value &= __pattern;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern));
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old = __a->__a_value;
-  __a->__a_value |= __pattern;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp
-__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old;
-  __cxx_atomic_assign_volatile(__old, __a->__a_value);
-  __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern));
-  __a->__unlock();
-  return __old;
-}
-template <typename _Tp>
-_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) {
-  __a->__lock();
-  _Tp __old = __a->__a_value;
-  __a->__a_value ^= __pattern;
-  __a->__unlock();
-  return __old;
-}
-
-template <typename _Tp,
-          typename _Base = typename conditional<__libcpp_is_always_lock_free<_Tp>::__value,
-                                                __cxx_atomic_base_impl<_Tp>,
-                                                __cxx_atomic_lock_impl<_Tp> >::type>
-#else
 template <typename _Tp, typename _Base = __cxx_atomic_base_impl<_Tp> >
-#endif //_LIBCPP_ATOMIC_ONLY_USE_BUILTINS
 struct __cxx_atomic_impl : public _Base {
   static_assert(is_trivially_copyable<_Tp>::value, "std::atomic<T> requires that 'T' be a trivially copyable type");
 
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 0797880cb2f5da..942bbe7cbb93cb 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1200,9 +1200,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
 #    ifndef _LIBCPP_ATOMIC_FLAG_TYPE
 #      define _LIBCPP_ATOMIC_FLAG_TYPE bool
 #    endif
-#    ifdef _LIBCPP_FREESTANDING
-#      define _LIBCPP_ATOMIC_ONLY_USE_BUILTINS
-#    endif
 #  endif
 
 #  if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(__no_thread_safety_analysis__)
diff --git a/libcxx/include/__utility/integer_sequence.h b/libcxx/include/__utility/integer_sequence.h
index e63f3f265b7d5e..ccce9433e7a801 100644
--- a/libcxx/include/__utility/integer_sequence.h
+++ b/libcxx/include/__utility/integer_sequence.h
@@ -31,65 +31,16 @@ struct __integer_sequence {
   using __to_tuple_indices = __tuple_indices<(_Values + _Sp)...>;
 };
 
-#if !__has_builtin(__make_integer_seq) || defined(_LIBCPP_TESTING_FALLBACK_MAKE_INTEGER_SEQUENCE)
-
-namespace __detail {
-
-template <typename _Tp, size_t... _Extra>
-struct __repeat;
-template <typename _Tp, _Tp... _Np, size_t... _Extra>
-struct __repeat<__integer_sequence<_Tp, _Np...>, _Extra...> {
-  typedef _LIBCPP_NODEBUG __integer_sequence<
-      _Tp,
-      _Np...,
-      sizeof...(_Np) + _Np...,
-      2 * sizeof...(_Np) + _Np...,
-      3 * sizeof...(_Np) + _Np...,
-      4 * sizeof...(_Np) + _Np...,
-      5 * sizeof...(_Np) + _Np...,
-      6 * sizeof...(_Np) + _Np...,
-      7 * sizeof...(_Np) + _Np...,
-      _Extra...>
-      type;
-};
-
-template <size_t _Np>
-struct __parity;
-template <size_t _Np>
-struct __make : __parity<_Np % 8>::template __pmake<_Np> {};
-
-// clang-format off
-template<> struct __make<0> { typedef __integer_sequence<size_t> type; };
-template<> struct __make<1> { typedef __integer_sequence<size_t, 0> type; };
-template<> struct __make<2> { typedef __integer_sequence<size_t, 0, 1> type; };
-template<> struct __make<3> { typedef __integer_sequence<size_t, 0, 1, 2> type; };
-template<> struct __make<4> { typedef __integer_sequence<size_t, 0, 1, 2, 3> type; };
-template<> struct __make<5> { typedef __integer_sequence<size_t, 0, 1, 2, 3, 4> type; };
-template<> struct __make<6> { typedef __integer_sequence<size_t, 0, 1, 2, 3, 4, 5> type; };
-template<> struct __make<7> { typedef __integer_sequence<size_t, 0, 1, 2, 3, 4, 5, 6> type; };
-
-template<> struct __parity<0> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type> {}; };
-template<> struct __parity<1> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 1> {}; };
-template<> struct __parity<2> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 2, _Np - 1> {}; };
-template<> struct __parity<3> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 3, _Np - 2, _Np - 1> {}; };
-template<> struct __parity<4> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; };
-template<> struct __parity<5> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; };
-template<> struct __parity<6> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 6, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; };
-template<> struct __parity<7> { template<size_t _Np> struct __pmake : __repeat<typename __make<_Np / 8>::type, _Np - 7, _Np - 6, _Np - 5, _Np - 4, _Np - 3, _Np - 2, _Np - 1> {}; };
-// clang-format on
-
-} // namespace __detail
-
-#endif
-
 #if __has_builtin(__make_integer_seq)
 template <size_t _Ep, size_t _Sp>
 using __make_indices_imp =
     typename __make_integer_seq<__integer_sequence, size_t, _Ep - _Sp>::template __to_tuple_indices<_Sp>;
-#else
+#elif __has_builtin(__integer_pack)
 template <size_t _Ep, size_t _Sp>
-using __make_indices_imp = typename __detail::__make<_Ep - _Sp>::type::template __to_tuple_indices<_Sp>;
-
+using __make_indices_imp =
+    typename __integer_sequence<size_t, __integer_pack(_Ep - _Sp)...>::template __to_tuple_indices<_Sp>;
+#else
+#  error "No known way to get an integer pack from the compiler"
 #endif
 
 #if _LIBCPP_STD_VER >= 14
@@ -104,34 +55,20 @@ struct _LIBCPP_TEMPLATE_VIS integer_sequence {
 template <size_t... _Ip>
 using index_sequence = integer_sequence<size_t, _Ip...>;
 
-#  if __has_builtin(__make_integer_seq) && !defined(_LIBCPP_TESTING_FALLBACK_MAKE_INTEGER_SEQUENCE)
+#  if __has_builtin(__make_integer_seq)
 
 template <class _Tp, _Tp _Ep>
-using __make_integer_sequence _LIBCPP_NODEBUG = __make_integer_seq<integer_sequence, _Tp, _Ep>;
-
-#  else
+using make_integer_sequence _LIBCPP_NODEBUG = __make_integer_seq<integer_sequence, _Tp, _Ep>;
 
-template <typename _Tp, _Tp _Np>
-using __make_integer_sequence_unchecked _LIBCPP_NODEBUG =
-    typename __detail::__make<_Np>::type::template __convert<integer_sequence, _Tp>;
+#  elif __has_builtin(__integer_pack)
 
-template <class _Tp, _Tp _Ep>
-struct __make_integer_sequence_checked {
-  static_assert(is_integral<_Tp>::value, "std::make_integer_sequence can only be instantiated with an integral type");
-  static_assert(0 <= _Ep, "std::make_integer_sequence must have a non-negative sequence length");
-  // Workaround GCC bug by preventing bad installations when 0 <= _Ep
-  // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68929
-  typedef _LIBCPP_NODEBUG __make_integer_sequence_unchecked<_Tp, 0 <= _Ep ? _Ep : 0> type;
-};
-
-template <class _Tp, _Tp _Ep>
-using __make_integer_sequence _LIBCPP_NODEBUG = typename __make_integer_sequence_checked<_Tp, _Ep>::type;
+template <class _Tp, _Tp _SequenceSize>
+using make_integer_sequence _LIBCPP_NODEBUG = integer_sequence<_Tp, __integer_pack(_SequenceSize)...>;
 
+#  else
+#    error "No known way to get an integer pack from the compiler"
 #  endif
 
-template <class _Tp, _Tp _Np>
-using make_integer_sequence = __make_integer_sequence<_Tp, _Np>;
-
 template <size_t _Np>
 using make_index_sequence = make_integer_sequence<size_t, _Np>;
 
diff --git a/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.pass.cpp b/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.pass.cpp
deleted file mode 100644
index ceeb4dd3eeec41..00000000000000
--- a/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.pass.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <utility>
-
-// template<class T, T N>
-//   using make_integer_sequence = integer_sequence<T, 0, 1, ..., N-1>;
-
-// UNSUPPORTED: c++03, c++11
-
-#define _LIBCPP_TESTING_FALLBACK_MAKE_INTEGER_SEQUENCE
-#include "make_integer_seq.pass.cpp"
-
-#include "test_macros.h"
diff --git a/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.verify.cpp b/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.verify.cpp
deleted file mode 100644
index 32a4a5431333e2..00000000000000
--- a/libcxx/test/std/utilities/intseq/intseq.make/make_integer_seq_fallback.verify.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <utility>
-
-// template<class T, T N>
-//   using make_integer_sequence = integer_sequence<T, 0, 1, ..., N-1>;
-
-// UNSUPPORTED: c++03, c++11
-
-// This test hangs during recursive template instantiation with libstdc++
-// UNSUPPORTED: stdlib=libstdc++
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_TESTING_FALLBACK_MAKE_INTEGER_SEQUENCE
-
-#include <utility>
-
-typedef std::make_integer_sequence<int, -3> MakeSeqT;
-MakeSeqT i; // expected-error-re@*:* {{static assertion failed{{.*}}std::make_integer_sequence must have a non-negative sequence length}}
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index a57f60c5eed36b..018ceec97f204a 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -691,6 +691,8 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) {
           .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR)
           .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR)
           .Cases("driverkit", "10", PLATFORM_DRIVERKIT)
+          .Cases("xros", "11", PLATFORM_XROS)
+          .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR)
           .Default(PLATFORM_UNKNOWN);
   if (platformVersion.platform == PLATFORM_UNKNOWN)
     error(Twine("malformed platform: ") + platformStr);
@@ -985,6 +987,8 @@ PlatformType macho::removeSimulator(PlatformType platform) {
     return PLATFORM_TVOS;
   case PLATFORM_WATCHOSSIMULATOR:
     return PLATFORM_WATCHOS;
+  case PLATFORM_XROS_SIMULATOR:
+    return PLATFORM_XROS;
   default:
     return platform;
   }
@@ -1001,15 +1005,17 @@ static bool shouldAdhocSignByDefault(Architecture arch, PlatformType platform) {
 
   return platform == PLATFORM_MACOS || platform == PLATFORM_IOSSIMULATOR ||
          platform == PLATFORM_TVOSSIMULATOR ||
-         platform == PLATFORM_WATCHOSSIMULATOR;
+         platform == PLATFORM_WATCHOSSIMULATOR ||
+         platform == PLATFORM_XROS_SIMULATOR;
 }
 
 static bool dataConstDefault(const InputArgList &args) {
-  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
+  static const std::array<std::pair<PlatformType, VersionTuple>, 6> minVersion =
       {{{PLATFORM_MACOS, VersionTuple(10, 15)},
         {PLATFORM_IOS, VersionTuple(13, 0)},
         {PLATFORM_TVOS, VersionTuple(13, 0)},
         {PLATFORM_WATCHOS, VersionTuple(6, 0)},
+        {PLATFORM_XROS, VersionTuple(1, 0)},
         {PLATFORM_BRIDGEOS, VersionTuple(4, 0)}}};
   PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
   auto it = llvm::find_if(minVersion,
@@ -1045,11 +1051,12 @@ static bool shouldEmitChainedFixups(const InputArgList &args) {
   bool isRequested = arg != nullptr;
 
   // Version numbers taken from the Xcode 13.3 release notes.
-  static const std::array<std::pair<PlatformType, VersionTuple>, 4> minVersion =
+  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
       {{{PLATFORM_MACOS, VersionTuple(11, 0)},
         {PLATFORM_IOS, VersionTuple(13, 4)},
         {PLATFORM_TVOS, VersionTuple(14, 0)},
-        {PLATFORM_WATCHOS, VersionTuple(7, 0)}}};
+        {PLATFORM_WATCHOS, VersionTuple(7, 0)},
+        {PLATFORM_XROS, VersionTuple(1, 0)}}};
   PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
   auto it = llvm::find_if(minVersion,
                           [&](const auto &p) { return p.first == platform; });
@@ -1688,8 +1695,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   if (args.getLastArg(OPT_reproducible))
     config->zeroModTime = true;
 
-  std::array<PlatformType, 3> encryptablePlatforms{
-      PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS};
+  std::array<PlatformType, 4> encryptablePlatforms{
+      PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS, PLATFORM_XROS};
   config->emitEncryptionInfo =
       args.hasFlag(OPT_encryptable, OPT_no_encryption,
                    is_contained(encryptablePlatforms, config->platform()));
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 01e73b789f9aab..a524e4a4c50841 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -377,7 +377,7 @@ def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">;
 
 def platform_version : MultiArg<["-"], "platform_version", 3>,
     MetaVarName<"<platform> <min_version> <sdk_version>">,
-    HelpText<"Platform (e.g., macos, ios, tvos, watchos, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, driverkit) and version numbers">,
+    HelpText<"Platform (e.g., macos, ios, tvos, watchos, xros, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, xros-sim, driverkit) and version numbers">,
     Group<grp_version>;
 def sdk_version : Separate<["-"], "sdk_version">,
     HelpText<"This option is undocumented in ld64">,
diff --git a/lld/test/MachO/lc-build-version.s b/lld/test/MachO/lc-build-version.s
index 7b78f803428a7f..1fd7078919b159 100644
--- a/lld/test/MachO/lc-build-version.s
+++ b/lld/test/MachO/lc-build-version.s
@@ -64,6 +64,13 @@
 
 # WATCHOS-4-0: cmd LC_VERSION_MIN_WATCHOS
 
+# RUN: %no-arg-lld -arch x86_64 -platform_version xros 1.0 1.1 -o %t.xros-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-1-0 | FileCheck %s --check-prefix=XROS-1-0
+# RUN: %no-arg-lld -arch x86_64 -platform_version xros-simulator 1.0 1.1 -o %t.xros-sim-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-sim-1-0 | FileCheck %s --check-prefix=XROS-1-0
+
+# XROS-1-0: cmd LC_BUILD_VERSION
+
 .text
 .global _main
 _main:
diff --git a/lld/test/MachO/platform-version.s b/lld/test/MachO/platform-version.s
index 047aea02fcde31..57fbae62b2ffcd 100644
--- a/lld/test/MachO/platform-version.s
+++ b/lld/test/MachO/platform-version.s
@@ -55,7 +55,7 @@
 # RUN:        -platform_version 0 1 5 \
 # RUN:     | FileCheck --check-prefix=FAIL-PLATFORM %s
 # RUN: not %no-arg-lld -arch x86_64 -o %t %t.o 2>&1 \
-# RUN:        -platform_version 11 1 5 \
+# RUN:        -platform_version 13 1 5 \
 # RUN:     | FileCheck --check-prefix=FAIL-PLATFORM %s
 # FAIL-PLATFORM: malformed platform: {{.*}}
 # FAIL-PLATFORM-NOT: malformed {{minimum|sdk}} version: {{.*}}
diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp
index ce707e530d008b..e66271e8a6ee99 100644
--- a/lldb/source/Host/common/Editline.cpp
+++ b/lldb/source/Host/common/Editline.cpp
@@ -1029,8 +1029,11 @@ unsigned char Editline::TabCommand(int ch) {
     case CompletionMode::Normal: {
       std::string to_add = completion.GetCompletion();
       // Terminate the current argument with a quote if it started with a quote.
-      if (!request.GetParsedLine().empty() && request.GetParsedArg().IsQuoted())
+      Args &parsedLine = request.GetParsedLine();
+      if (!parsedLine.empty() && request.GetCursorIndex() < parsedLine.size() &&
+          request.GetParsedArg().IsQuoted()) {
         to_add.push_back(request.GetParsedArg().GetQuoteChar());
+      }
       to_add.push_back(' ');
       el_deletestr(m_editline, request.GetCursorArgumentPrefix().size());
       el_insertstr(m_editline, to_add.c_str());
diff --git a/lldb/test/API/repl/clang/TestClangREPL.py b/lldb/test/API/repl/clang/TestClangREPL.py
index 0b67955a7833c6..c37557fb94735d 100644
--- a/lldb/test/API/repl/clang/TestClangREPL.py
+++ b/lldb/test/API/repl/clang/TestClangREPL.py
@@ -1,7 +1,6 @@
-import lldb
 from lldbsuite.test.decorators import *
-from lldbsuite.test.lldbtest import *
 from lldbsuite.test.lldbpexpect import PExpectTest
+from lldbsuite.test.lldbtest import *
 
 
 class TestCase(PExpectTest):
@@ -17,13 +16,7 @@ def expect_repl(self, expr, substrs=[]):
         self.current_repl_line_number += 1
         self.child.expect_exact(str(self.current_repl_line_number) + ">")
 
-    # PExpect uses many timeouts internally and doesn't play well
-    # under ASAN on a loaded machine..
-    @skipIfAsan
-    @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
-    @skipIfEditlineSupportMissing
-    def test_basic_completion(self):
-        """Test that we can complete a simple multiline expression"""
+    def start_repl(self):
         self.build()
         self.current_repl_line_number = 1
 
@@ -41,6 +34,14 @@ def test_basic_completion(self):
         self.child.send("expression --repl -l c --\n")
         self.child.expect_exact("1>")
 
+    # PExpect uses many timeouts internally and doesn't play well
+    # under ASAN on a loaded machine..
+    @skipIfAsan
+    @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
+    @skipIfEditlineSupportMissing
+    def test_basic_completion(self):
+        """Test that we can complete a simple multiline expression"""
+        self.start_repl()
         # Try evaluating a simple expression.
         self.expect_repl("3 + 3", substrs=["(int) $0 = 6"])
 
@@ -54,3 +55,16 @@ def test_basic_completion(self):
         self.expect_repl("$persistent + 10", substrs=["(long) $2 = 17"])
 
         self.quit()
+
+    # PExpect uses many timeouts internally and doesn't play well
+    # under ASAN on a loaded machine..
+    @skipIfAsan
+    @skipIf(oslist=["linux"], archs=["arm", "aarch64"])  # Randomly fails on buildbot
+    @skipIfEditlineSupportMissing
+    def test_completion_with_space_only_line(self):
+        """Test that we don't crash when completing lines with spaces only"""
+        self.start_repl()
+
+        self.child.send("   ")
+        self.child.send("\t")
+        self.expect_repl("3 + 3", substrs=["(int) $0 = 6"])
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index f5f7d3f3253fd3..651f17879fad24 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -718,6 +718,8 @@ if(LLVM_INDIVIDUAL_TEST_COVERAGE)
 endif()
 set(LLVM_LIT_ARGS "${LIT_ARGS_DEFAULT}" CACHE STRING "Default options for lit")
 
+option(LLVM_PARALLEL_LIT "Enable multiple lit suites to run in parallel" OFF)
+
 # On Win32 hosts, provide an option to specify the path to the GnuWin32 tools.
 if( WIN32 AND NOT CYGWIN )
   set(LLVM_LIT_TOOLS_DIR "" CACHE PATH "Path to GnuWin32 tools")
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 2b3b9ba45f629a..af58ff861c8915 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1951,11 +1951,18 @@ function(add_lit_target target comment)
     list(APPEND LIT_COMMAND --param ${param})
   endforeach()
   if (ARG_UNPARSED_ARGUMENTS)
-    add_custom_target(${target}
-      COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
-      COMMENT "${comment}"
-      USES_TERMINAL
-      )
+    if (LLVM_PARALLEL_LIT)
+     add_custom_target(${target}
+       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+       COMMENT "${comment}"
+       )
+    else()
+     add_custom_target(${target}
+       COMMAND ${LIT_COMMAND} ${ARG_UNPARSED_ARGUMENTS}
+       COMMENT "${comment}"
+       USES_TERMINAL
+       )
+    endif()
   else()
     add_custom_target(${target}
       COMMAND ${CMAKE_COMMAND} -E echo "${target} does nothing, no tools built.")
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index abef4f8103140f..35c47989a7eef0 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -762,6 +762,12 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_PARALLEL_LINK_JOBS**:STRING
   Define the maximum number of concurrent link jobs.
 
+**LLVM_PARALLEL_LIT**:BOOL
+  Defaults to ``OFF``. If set to ``OFF``, lit testsuites will be configured
+  with CMake's ``USES_TERMINAL`` flag to give direct access to the terminal. If
+  set to ``ON``, that flag will be removed allowing Ninja to schedule multiple
+  lit testsuites in parallel.
+
 **LLVM_RAM_PER_COMPILE_JOB**:STRING
   Calculates the amount of Ninja compile jobs according to available resources.
   Value has to be in MB, overwrites LLVM_PARALLEL_COMPILE_JOBS. Compile jobs 
diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index 9e3c19078f1cce..fdf17c7fe41285 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -89,6 +89,14 @@ properly.
   annotation requires the subprocess execution mode. This is useful in
   cases where the memory accessed by the snippet depends on the location
   of the snippet, like RIP-relative addressing.
+* `LLVM-EXEGESIS-LOOP-REGISTER <register name>` - This annotation specifies
+  the loop register to use for keeping track of the current iteration when
+  using the loop repetition mode. :program:`llvm-exegesis` needs to keep track
+  of the current loop iteration within the loop repetition mode in a performant
+  manner (i.e., no memory accesses), and uses a register to do this. This register
+  has an architecture specific default (e.g., `R8` on X86), but this might conflict
+  with some snippets. This annotation allows changing the register to prevent
+  interference between the loop index register and the snippet.
 
 EXAMPLE 1: benchmarking instructions
 ------------------------------------
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 755291676abf6a..9d0cb7ad119589 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -455,6 +455,15 @@ them.
  Set the start address of the output to ``<addr>``. Overrides any previously
  specified :option:`--change-start` or :option:`--adjust-start` options.
 
+.. option:: --set-symbol-visibility <symbol>=<visibility>
+
+ Change the visibility of a symbol to the specified value.
+
+.. option:: --set-symbols-visibility <filename>=<visibility>
+
+ Read a list of symbols from <filename> and change their visibility to the
+ specified value. Visibility values: default, internal, hidden, protected.
+
 .. option:: --split-dwo <dwo-file>
 
  Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 8a3a0ec66ed874..51b6527f65bb04 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -147,6 +147,10 @@ Changes to the LLVM tools
   if it's not specified with the ``--format`` argument and cannot be inferred from
   input files.
 
+* llvm-objcopy now supports ``--set-symbol-visibility`` and
+  ``--set-symbols-visibility`` options for ELF input to change the
+  visibility of symbols.
+
 Changes to LLDB
 ---------------------------------
 
diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h
index c3dcd568216b71..a28e19edb4c6a6 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainer.h
+++ b/llvm/include/llvm/BinaryFormat/DXContainer.h
@@ -141,7 +141,7 @@ enum class PartType {
 #include "DXContainerConstants.def"
 };
 
-#define SHADER_FLAG(Num, Val, Str) Val = 1ull << Num,
+#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num,
 enum class FeatureFlags : uint64_t {
 #include "DXContainerConstants.def"
 };
diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
index 87dd0a5cb6ba70..80ed86bc3a499e 100644
--- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
+++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def
@@ -11,43 +11,43 @@ CONTAINER_PART(PSG1)
 #undef CONTAINER_PART
 #endif 
 
-#ifdef SHADER_FLAG
-
-SHADER_FLAG(0, Doubles, "Double-precision floating point")
-SHADER_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
-SHADER_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
-SHADER_FLAG(3, Max64UAVs, "64 UAV slots")
-SHADER_FLAG(4, MinimumPrecision, "Minimum-precision data types")
-SHADER_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
-SHADER_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
-SHADER_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
-SHADER_FLAG(8, TiledResources, "Tiled resources")
-SHADER_FLAG(9, StencilRef, "PS Output Stencil Ref")
-SHADER_FLAG(10, InnerCoverage, "PS Inner Coverage")
-SHADER_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
-SHADER_FLAG(12, ROVs, "Raster Ordered UAVs")
-SHADER_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
-SHADER_FLAG(14, WaveOps, "Wave level operations")
-SHADER_FLAG(15, Int64Ops, "64-Bit integer")
-SHADER_FLAG(16, ViewID, "View Instancing")
-SHADER_FLAG(17, Barycentrics, "Barycentrics")
-SHADER_FLAG(18, NativeLowPrecision, "Use native low precision")
-SHADER_FLAG(19, ShadingRate, "Shading Rate")
-SHADER_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
-SHADER_FLAG(21, SamplerFeedback, "Sampler feedback")
-SHADER_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
-SHADER_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
-SHADER_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
-SHADER_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
-SHADER_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
-SHADER_FLAG(27, RESERVED, "<RESERVED>")
-SHADER_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
-SHADER_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
-SHADER_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
-
-SHADER_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
-
-#undef SHADER_FLAG
+#ifdef SHADER_FEATURE_FLAG
+
+SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point")
+SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers")
+SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage")
+SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots")
+SHADER_FEATURE_FLAG(4, MinimumPrecision, "Minimum-precision data types")
+SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1")
+SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1")
+SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9")
+SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources")
+SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref")
+SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage")
+SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats")
+SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs")
+SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer")
+SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations")
+SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer")
+SHADER_FEATURE_FLAG(16, ViewID, "View Instancing")
+SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics")
+SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision")
+SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate")
+SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features")
+SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback")
+SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources")
+SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared")
+SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders")
+SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing")
+SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing")
+SHADER_FEATURE_FLAG(27, RESERVED, "<RESERVED>")
+SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources")
+SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops")
+SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures")
+
+SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)")
+
+#undef SHADER_FEATURE_FLAG
 #endif
 
 #ifdef SEMANTIC_KIND
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 5454df02914af6..bfac54a65c5b4e 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -612,6 +612,7 @@ class IRTranslator : public MachineFunctionPass {
   AAResults *AA = nullptr;
   AssumptionCache *AC = nullptr;
   const TargetLibraryInfo *LibInfo = nullptr;
+  const TargetLowering *TLI = nullptr;
   FunctionLoweringInfo FuncInfo;
 
   // True when either the Target Machine specifies no optimizations or the
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 3d0f836b0c7578..29de6bd8685e06 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
 #define LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
 
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include <cstdint>
 
@@ -81,10 +82,8 @@ class BaseIndexOffset {
 
   // Returns true `Op0` and `Op1` can be proven to alias/not alias, in
   // which case `IsAlias` is set to true/false.
-  static bool computeAliasing(const SDNode *Op0,
-                              const std::optional<int64_t> NumBytes0,
-                              const SDNode *Op1,
-                              const std::optional<int64_t> NumBytes1,
+  static bool computeAliasing(const SDNode *Op0, const LocationSize NumBytes0,
+                              const SDNode *Op1, const LocationSize NumBytes1,
                               const SelectionDAG &DAG, bool &IsAlias);
 
   /// Parses tree in N for base, index, offset addresses.
diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h
index 288ddf77bdfda7..b75dc8db54336b 100644
--- a/llvm/include/llvm/DebugInfo/DIContext.h
+++ b/llvm/include/llvm/DebugInfo/DIContext.h
@@ -206,6 +206,7 @@ struct DIDumpOptions {
   bool IsEH = false;
   bool DumpNonSkeleton = false;
   bool ShowAggregateErrors = false;
+  std::string JsonErrSummaryFile;
   std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)>
       GetNameForDWARFReg;
 
diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h
index 04bc58d8f63e47..d33af157543fec 100644
--- a/llvm/include/llvm/Demangle/ItaniumDemangle.h
+++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h
@@ -5540,7 +5540,7 @@ Node *AbstractManglingParser<Alloc, Derived>::parseFloatingLiteral() {
     return nullptr;
   std::string_view Data(First, N);
   for (char C : Data)
-    if (!std::isxdigit(C))
+    if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f'))
       return nullptr;
   First += N;
   if (!consumeIf('E'))
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 589a9066ac57af..5bbaa8c208b8cd 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1834,13 +1834,15 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param FiniCB Callback to finalize variable copies.
   /// \param IsNowait If false, a barrier is emitted.
-  /// \param DidIt Local variable used as a flag to indicate 'single' thread
+  /// \param CPVars copyprivate variables.
+  /// \param CPFuncs copy functions to use for each copyprivate variable.
   ///
   /// \returns The insertion position *after* the single call.
   InsertPointTy createSingle(const LocationDescription &Loc,
                              BodyGenCallbackTy BodyGenCB,
                              FinalizeCallbackTy FiniCB, bool IsNowait,
-                             llvm::Value *DidIt);
+                             ArrayRef<llvm::Value *> CPVars = {},
+                             ArrayRef<llvm::Function *> CPFuncs = {});
 
   /// Generator for '#omp master'
   ///
diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
index d77cb69b159db6..eafed92516c7df 100644
--- a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
+++ b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_OBJCOPY_ELF_ELFCONFIG_H
 #define LLVM_OBJCOPY_ELF_ELFCONFIG_H
 
+#include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/Object/ELFTypes.h"
 
 namespace llvm {
@@ -18,6 +19,8 @@ namespace objcopy {
 struct ELFConfig {
   uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT;
 
+  std::vector<std::pair<NameMatcher, uint8_t>> SymbolsToSetVisibility;
+
   // ELF entry point address expression. The input parameter is an entry point
   // address in the input ELF file. The entry address in the output file is
   // calculated with EntryExpr(input_address), when either --set-start or
diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h
index a7f18c79969803..b6e3d321da2461 100644
--- a/llvm/include/llvm/Object/DXContainer.h
+++ b/llvm/include/llvm/Object/DXContainer.h
@@ -276,7 +276,7 @@ class DXContainer {
   dxbc::Header Header;
   SmallVector<uint32_t, 4> PartOffsets;
   std::optional<DXILData> DXIL;
-  std::optional<uint64_t> ShaderFlags;
+  std::optional<uint64_t> ShaderFeatureFlags;
   std::optional<dxbc::ShaderHash> Hash;
   std::optional<DirectX::PSVRuntimeInfo> PSVInfo;
   DirectX::Signature InputSignature;
@@ -286,7 +286,7 @@ class DXContainer {
   Error parseHeader();
   Error parsePartOffsets();
   Error parseDXILHeader(StringRef Part);
-  Error parseShaderFlags(StringRef Part);
+  Error parseShaderFeatureFlags(StringRef Part);
   Error parseHash(StringRef Part);
   Error parsePSVInfo(StringRef Part);
   Error parseSignature(StringRef Part, DirectX::Signature &Array);
@@ -368,7 +368,9 @@ class DXContainer {
 
   const std::optional<DXILData> &getDXIL() const { return DXIL; }
 
-  std::optional<uint64_t> getShaderFlags() const { return ShaderFlags; }
+  std::optional<uint64_t> getShaderFeatureFlags() const {
+    return ShaderFeatureFlags;
+  }
 
   std::optional<dxbc::ShaderHash> getShaderHash() const { return Hash; }
 
diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
index 66a6ac70bbea10..497f82bbd0f32a 100644
--- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
+++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h
@@ -56,10 +56,10 @@ struct DXILProgram {
   std::optional<std::vector<llvm::yaml::Hex8>> DXIL;
 };
 
-#define SHADER_FLAG(Num, Val, Str) bool Val = false;
-struct ShaderFlags {
-  ShaderFlags() = default;
-  ShaderFlags(uint64_t FlagData);
+#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false;
+struct ShaderFeatureFlags {
+  ShaderFeatureFlags() = default;
+  ShaderFeatureFlags(uint64_t FlagData);
   uint64_t getEncodedFlags();
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 };
@@ -151,7 +151,7 @@ struct Part {
   std::string Name;
   uint32_t Size;
   std::optional<DXILProgram> Program;
-  std::optional<ShaderFlags> Flags;
+  std::optional<ShaderFeatureFlags> Flags;
   std::optional<ShaderHash> Hash;
   std::optional<PSVInfo> Info;
   std::optional<DXContainerYAML::Signature> Signature;
@@ -195,8 +195,8 @@ template <> struct MappingTraits<DXContainerYAML::DXILProgram> {
   static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program);
 };
 
-template <> struct MappingTraits<DXContainerYAML::ShaderFlags> {
-  static void mapping(IO &IO, DXContainerYAML::ShaderFlags &Flags);
+template <> struct MappingTraits<DXContainerYAML::ShaderFeatureFlags> {
+  static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags);
 };
 
 template <> struct MappingTraits<DXContainerYAML::ShaderHash> {
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 1b793b0eccf3c7..68dbe1ea3062ab 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -321,8 +321,6 @@ class TypeSize : public details::FixedOrScalableQuantity<TypeSize, uint64_t> {
       : FixedOrScalableQuantity(V) {}
 
 public:
-  constexpr TypeSize() : FixedOrScalableQuantity(0, false) {}
-
   constexpr TypeSize(ScalarTy Quantity, bool Scalable)
       : FixedOrScalableQuantity(Quantity, Scalable) {}
 
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 38bb808dd5bd53..7c986dbbc2c7c8 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -596,8 +596,6 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
   const Value *CondVal = BrInst.getCondition();
   MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1));
 
-  const auto &TLI = *MF->getSubtarget().getTargetLowering();
-
   // If this is a series of conditions that are or'd or and'd together, emit
   // this as a sequence of branches instead of setcc's with and/or operations.
   // As long as jumps are not expensive (exceptions for multi-use logic ops,
@@ -617,7 +615,7 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
   //     jle foo
   using namespace PatternMatch;
   const Instruction *CondI = dyn_cast<Instruction>(CondVal);
-  if (!TLI.isJumpExpensive() && CondI && CondI->hasOneUse() &&
+  if (!TLI->isJumpExpensive() && CondI && CondI->hasOneUse() &&
       !BrInst.hasMetadata(LLVMContext::MD_unpredictable)) {
     Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0;
     Value *Vec;
@@ -1385,9 +1383,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
     return true;
   }
 
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
   MachineMemOperand::Flags Flags =
-      TLI.getLoadMemOperandFlags(LI, *DL, AC, LibInfo);
+      TLI->getLoadMemOperandFlags(LI, *DL, AC, LibInfo);
   if (AA && !(Flags & MachineMemOperand::MOInvariant)) {
     if (AA->pointsToConstantMemory(
             MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) {
@@ -1434,8 +1431,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
     return true;
   }
 
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
-  MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL);
+  MachineMemOperand::Flags Flags = TLI->getStoreMemOperandFlags(SI, *DL);
 
   for (unsigned i = 0; i < Vals.size(); ++i) {
     Register Addr;
@@ -1779,8 +1775,7 @@ void IRTranslator::getStackGuard(Register DstReg,
   auto MIB =
       MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {});
 
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
-  Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent());
+  Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent());
   if (!Global)
     return;
 
@@ -2111,9 +2106,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     // does. Simplest intrinsic ever!
     return true;
   case Intrinsic::vastart: {
-    auto &TLI = *MF->getSubtarget().getTargetLowering();
     Value *Ptr = CI.getArgOperand(0);
-    unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+    unsigned ListSize = TLI->getVaListSizeInBits(*DL) / 8;
     Align Alignment = getKnownAlignment(Ptr, *DL);
 
     MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)})
@@ -2189,14 +2183,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder);
   case Intrinsic::fmuladd: {
     const TargetMachine &TM = MF->getTarget();
-    const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
     Register Dst = getOrCreateVReg(CI);
     Register Op0 = getOrCreateVReg(*CI.getArgOperand(0));
     Register Op1 = getOrCreateVReg(*CI.getArgOperand(1));
     Register Op2 = getOrCreateVReg(*CI.getArgOperand(2));
     if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
-        TLI.isFMAFasterThanFMulAndFAdd(*MF,
-                                       TLI.getValueType(*DL, CI.getType()))) {
+        TLI->isFMAFasterThanFMulAndFAdd(*MF,
+                                        TLI->getValueType(*DL, CI.getType()))) {
       // TODO: Revisit this to see if we should move this part of the
       // lowering to the combiner.
       MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
@@ -2254,10 +2247,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     getStackGuard(getOrCreateVReg(CI), MIRBuilder);
     return true;
   case Intrinsic::stackprotector: {
-    const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
     LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
     Register GuardVal;
-    if (TLI.useLoadStackGuardNode()) {
+    if (TLI->useLoadStackGuardNode()) {
       GuardVal = MRI->createGenericVirtualRegister(PtrTy);
       getStackGuard(GuardVal, MIRBuilder);
     } else
@@ -2635,10 +2627,9 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   }
 
   // Add a MachineMemOperand if it is a target mem intrinsic.
-  const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
   TargetLowering::IntrinsicInfo Info;
   // TODO: Add a GlobalISel version of getTgtMemIntrinsic.
-  if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
+  if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) {
     Align Alignment = Info.align.value_or(
         DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
     LLT MemTy = Info.memVT.isSimple()
@@ -2818,10 +2809,9 @@ bool IRTranslator::translateLandingPad(const User &U,
 
   // If there aren't registers to copy the values into (e.g., during SjLj
   // exceptions), then don't bother.
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
   const Constant *PersonalityFn = MF->getFunction().getPersonalityFn();
-  if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
-      TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
+  if (TLI->getExceptionPointerRegister(PersonalityFn) == 0 &&
+      TLI->getExceptionSelectorRegister(PersonalityFn) == 0)
     return true;
 
   // If landingpad's return type is token type, we don't create DAG nodes
@@ -2852,7 +2842,7 @@ bool IRTranslator::translateLandingPad(const User &U,
   assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
 
   // Mark exception register as live in.
-  Register ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+  Register ExceptionReg = TLI->getExceptionPointerRegister(PersonalityFn);
   if (!ExceptionReg)
     return false;
 
@@ -2860,7 +2850,7 @@ bool IRTranslator::translateLandingPad(const User &U,
   ArrayRef<Register> ResRegs = getOrCreateVRegs(LP);
   MIRBuilder.buildCopy(ResRegs[0], ExceptionReg);
 
-  Register SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+  Register SelectorReg = TLI->getExceptionSelectorRegister(PersonalityFn);
   if (!SelectorReg)
     return false;
 
@@ -2986,8 +2976,7 @@ bool IRTranslator::translateExtractElement(const User &U,
 
   Register Res = getOrCreateVReg(U);
   Register Val = getOrCreateVReg(*U.getOperand(0));
-  const auto &TLI = *MF->getSubtarget().getTargetLowering();
-  unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits();
+  unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits();
   Register Idx;
   if (auto *CI = dyn_cast<ConstantInt>(U.getOperand(1))) {
     if (CI->getBitWidth() != PreferredVecIdxWidth) {
@@ -3039,8 +3028,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
                                           MachineIRBuilder &MIRBuilder) {
   const AtomicCmpXchgInst &I = cast<AtomicCmpXchgInst>(U);
 
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
-  auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+  auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);
 
   auto Res = getOrCreateVRegs(I);
   Register OldValRes = Res[0];
@@ -3061,8 +3049,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
 bool IRTranslator::translateAtomicRMW(const User &U,
                                       MachineIRBuilder &MIRBuilder) {
   const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
-  auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
+  auto Flags = TLI->getAtomicMemOperandFlags(I, *DL);
 
   Register Res = getOrCreateVReg(I);
   Register Addr = getOrCreateVReg(*I.getPointerOperand());
@@ -3302,8 +3289,7 @@ bool IRTranslator::translate(const Instruction &Inst) {
   CurBuilder->setDebugLoc(Inst.getDebugLoc());
   CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections));
 
-  auto &TLI = *MF->getSubtarget().getTargetLowering();
-  if (TLI.fallBackToDAGISel(Inst))
+  if (TLI->fallBackToDAGISel(Inst))
     return false;
 
   switch (Inst.getOpcode()) {
@@ -3454,9 +3440,8 @@ bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB,
   // Check if we need to generate stack-protector guard checks.
   StackProtector &SP = getAnalysis<StackProtector>();
   if (SP.shouldEmitSDCheck(BB)) {
-    const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
     bool FunctionBasedInstrumentation =
-        TLI.getSSPStackGuardCheck(*MF->getFunction().getParent());
+        TLI->getSSPStackGuardCheck(*MF->getFunction().getParent());
     SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation);
   }
   // Handle stack protector.
@@ -3501,10 +3486,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
                                           MachineBasicBlock *ParentBB) {
   CurBuilder->setInsertPt(*ParentBB, ParentBB->end());
   // First create the loads to the guard/stack slot for the comparison.
-  const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
   Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext());
   const LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
-  LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL));
+  LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL));
 
   MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
   int FI = MFI.getStackProtectorIndex();
@@ -3522,13 +3506,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
                       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile)
           .getReg(0);
 
-  if (TLI.useStackGuardXorFP()) {
+  if (TLI->useStackGuardXorFP()) {
     LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented");
     return false;
   }
 
   // Retrieve guard check function, nullptr if instrumentation is inlined.
-  if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) {
+  if (const Function *GuardCheckFn = TLI->getSSPStackGuardCheck(M)) {
     // This path is currently untestable on GlobalISel, since the only platform
     // that needs this seems to be Windows, and we fall back on that currently.
     // The code still lives here in case that changes.
@@ -3563,13 +3547,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
 
   // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
   // Otherwise, emit a volatile load to retrieve the stack guard value.
-  if (TLI.useLoadStackGuardNode()) {
+  if (TLI->useLoadStackGuardNode()) {
     Guard =
         MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits()));
     getStackGuard(Guard, *CurBuilder);
   } else {
     // TODO: test using android subtarget when we support @llvm.thread.pointer.
-    const Value *IRGuard = TLI.getSDagStackGuard(M);
+    const Value *IRGuard = TLI->getSDagStackGuard(M);
     Register GuardPtr = getOrCreateVReg(*IRGuard);
 
     Guard = CurBuilder
@@ -3593,13 +3577,12 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD,
 bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD,
                                            MachineBasicBlock *FailureBB) {
   CurBuilder->setInsertPt(*FailureBB, FailureBB->end());
-  const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
 
   const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL;
-  const char *Name = TLI.getLibcallName(Libcall);
+  const char *Name = TLI->getLibcallName(Libcall);
 
   CallLowering::CallLoweringInfo Info;
-  Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+  Info.CallConv = TLI->getLibcallCallingConv(Libcall);
   Info.Callee = MachineOperand::CreateES(Name);
   Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()),
                   0};
@@ -3662,6 +3645,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   bool EnableCSE = EnableCSEInIRTranslator.getNumOccurrences()
                        ? EnableCSEInIRTranslator
                        : TPC->isGISelCSEEnabled();
+  TLI = MF->getSubtarget().getTargetLowering();
 
   if (EnableCSE) {
     EntryBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
@@ -3696,12 +3680,8 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
   FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF);
 
-  const auto &TLI = *MF->getSubtarget().getTargetLowering();
-
   SL = std::make_unique<GISelSwitchLowering>(this, FuncInfo);
-  SL->init(TLI, TM, *DL);
-
-
+  SL->init(*TLI, TM, *DL);
 
   assert(PendingPHIs.empty() && "stale PHIs");
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c4d49adc21c4b7..33ada3655dc731 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27835,7 +27835,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
     bool IsAtomic;
     SDValue BasePtr;
     int64_t Offset;
-    std::optional<int64_t> NumBytes;
+    LocationSize NumBytes;
     MachineMemOperand *MMO;
   };
 
@@ -27853,7 +27853,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
               LSN->isAtomic(),
               LSN->getBasePtr(),
               Offset /*base offset*/,
-              std::optional<int64_t>(Size),
+              Size != ~UINT64_C(0) ? LocationSize::precise(Size)
+                                   : LocationSize::beforeOrAfterPointer(),
               LSN->getMemOperand()};
     }
     if (const auto *LN = cast<LifetimeSDNode>(N))
@@ -27861,13 +27862,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
               /*isAtomic*/ false,
               LN->getOperand(1),
               (LN->hasOffset()) ? LN->getOffset() : 0,
-              (LN->hasOffset()) ? std::optional<int64_t>(LN->getSize())
-                                : std::optional<int64_t>(),
+              (LN->hasOffset()) ? LocationSize::precise(LN->getSize())
+                                : LocationSize::beforeOrAfterPointer(),
               (MachineMemOperand *)nullptr};
     // Default.
     return {false /*isvolatile*/,
-            /*isAtomic*/ false,          SDValue(),
-            (int64_t)0 /*offset*/,       std::optional<int64_t>() /*size*/,
+            /*isAtomic*/ false,
+            SDValue(),
+            (int64_t)0 /*offset*/,
+            LocationSize::beforeOrAfterPointer() /*size*/,
             (MachineMemOperand *)nullptr};
   };
 
@@ -27922,18 +27925,20 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
   int64_t SrcValOffset1 = MUC1.MMO->getOffset();
   Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
   Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
-  auto &Size0 = MUC0.NumBytes;
-  auto &Size1 = MUC1.NumBytes;
+  LocationSize Size0 = MUC0.NumBytes;
+  LocationSize Size1 = MUC1.NumBytes;
   if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
-      Size0.has_value() && Size1.has_value() && *Size0 == *Size1 &&
-      OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
-      SrcValOffset1 % *Size1 == 0) {
+      Size0.hasValue() && Size1.hasValue() && Size0 == Size1 &&
+      OrigAlignment0 > Size0.getValue() &&
+      SrcValOffset0 % Size0.getValue() == 0 &&
+      SrcValOffset1 % Size1.getValue() == 0) {
     int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
     int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
 
     // There is no overlap between these relatively aligned accesses of
     // similar size. Return no alias.
-    if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
+    if ((OffAlign0 + (int64_t)Size0.getValue()) <= OffAlign1 ||
+        (OffAlign1 + (int64_t)Size1.getValue()) <= OffAlign0)
       return false;
   }
 
@@ -27946,12 +27951,12 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
     UseAA = false;
 #endif
 
-  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 &&
-      Size1) {
+  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+      Size0.hasValue() && Size1.hasValue()) {
     // Use alias analysis information.
     int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
-    int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
-    int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
+    int64_t Overlap0 = Size0.getValue() + SrcValOffset0 - MinOffset;
+    int64_t Overlap1 = Size1.getValue() + SrcValOffset1 - MinOffset;
     if (AA->isNoAlias(
             MemoryLocation(MUC0.MMO->getValue(), Overlap0,
                            UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 66825d845c1910..9670c3ac8430eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -91,11 +91,10 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other,
 }
 
 bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
-                                      const std::optional<int64_t> NumBytes0,
+                                      const LocationSize NumBytes0,
                                       const SDNode *Op1,
-                                      const std::optional<int64_t> NumBytes1,
+                                      const LocationSize NumBytes1,
                                       const SelectionDAG &DAG, bool &IsAlias) {
-
   BaseIndexOffset BasePtr0 = match(Op0, DAG);
   if (!BasePtr0.getBase().getNode())
     return false;
@@ -105,27 +104,26 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0,
     return false;
 
   int64_t PtrDiff;
-  if (NumBytes0 && NumBytes1 &&
-      BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) {
+  if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) {
     // If the size of memory access is unknown, do not use it to analysis.
     // One example of unknown size memory access is to load/store scalable
     // vector objects on the stack.
     // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the
     // following situations arise:
-    if (PtrDiff >= 0 &&
-        *NumBytes0 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+    if (PtrDiff >= 0 && NumBytes0.hasValue() && !NumBytes0.isScalable()) {
       // [----BasePtr0----]
       //                         [---BasePtr1--]
       // ========PtrDiff========>
-      IsAlias = !(*NumBytes0 <= PtrDiff);
+      IsAlias = !(static_cast<int64_t>(NumBytes0.getValue().getFixedValue()) <=
+                  PtrDiff);
       return true;
     }
-    if (PtrDiff < 0 &&
-        *NumBytes1 != static_cast<int64_t>(MemoryLocation::UnknownSize)) {
+    if (PtrDiff < 0 && NumBytes1.hasValue() && !NumBytes1.isScalable()) {
       //                     [----BasePtr0----]
       // [---BasePtr1--]
       // =====(-PtrDiff)====>
-      IsAlias = !((PtrDiff + *NumBytes1) <= 0);
+      IsAlias = !((PtrDiff + static_cast<int64_t>(
+                                 NumBytes1.getValue().getFixedValue())) <= 0);
       return true;
     }
     return false;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 20ef59e7b4422e..520debe513d9f1 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -29,7 +29,9 @@
 #include "llvm/Support/DJB.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/JSON.h"
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
@@ -2026,12 +2028,37 @@ void OutputCategoryAggregator::EnumerateResults(
 }
 
 void DWARFVerifier::summarize() {
-  if (ErrorCategory.GetNumCategories() && DumpOpts.ShowAggregateErrors) {
+  if (DumpOpts.ShowAggregateErrors && ErrorCategory.GetNumCategories()) {
     error() << "Aggregated error counts:\n";
     ErrorCategory.EnumerateResults([&](StringRef s, unsigned count) {
       error() << s << " occurred " << count << " time(s).\n";
     });
   }
+  if (!DumpOpts.JsonErrSummaryFile.empty()) {
+    std::error_code EC;
+    raw_fd_ostream JsonStream(DumpOpts.JsonErrSummaryFile, EC,
+                              sys::fs::OF_Text);
+    if (EC) {
+      error() << "unable to open json summary file '"
+              << DumpOpts.JsonErrSummaryFile
+              << "' for writing: " << EC.message() << '\n';
+      return;
+    }
+
+    llvm::json::Object Categories;
+    uint64_t ErrorCount = 0;
+    ErrorCategory.EnumerateResults([&](StringRef Category, unsigned Count) {
+      llvm::json::Object Val;
+      Val.try_emplace("count", Count);
+      Categories.try_emplace(Category, std::move(Val));
+      ErrorCount += Count;
+    });
+    llvm::json::Object RootNode;
+    RootNode.try_emplace("error-categories", std::move(Categories));
+    RootNode.try_emplace("error-count", ErrorCount);
+
+    JsonStream << llvm::json::Value(std::move(RootNode));
+  }
 }
 
 raw_ostream &DWARFVerifier::error() const { return WithColor::error(OS); }
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 09f59c81123ea5..d65ed8c11d86cc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4047,13 +4047,17 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
 
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
     const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
-    FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) {
+    FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
+    ArrayRef<llvm::Function *> CPFuncs) {
 
   if (!updateToLocation(Loc))
     return Loc.IP;
 
-  // If needed (i.e. not null), initialize `DidIt` with 0
-  if (DidIt) {
+  // If needed allocate and initialize `DidIt` with 0.
+  // DidIt: flag variable: 1=single thread; 0=not single thread.
+  llvm::Value *DidIt = nullptr;
+  if (!CPVars.empty()) {
+    DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
     Builder.CreateStore(Builder.getInt32(0), DidIt);
   }
 
@@ -4070,17 +4074,36 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
   Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
   Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
 
+  auto FiniCBWrapper = [&](InsertPointTy IP) {
+    FiniCB(IP);
+
+    // The thread that executes the single region must set `DidIt` to 1.
+    // This is used by __kmpc_copyprivate, to know if the caller is the
+    // single thread or not.
+    if (DidIt)
+      Builder.CreateStore(Builder.getInt32(1), DidIt);
+  };
+
   // generates the following:
   // if (__kmpc_single()) {
   //		.... single region ...
   // 		__kmpc_end_single
   // }
+  // __kmpc_copyprivate
   // __kmpc_barrier
 
-  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
                        /*Conditional*/ true,
                        /*hasFinalize*/ true);
-  if (!IsNowait)
+
+  if (DidIt) {
+    for (size_t I = 0, E = CPVars.size(); I < E; ++I)
+      // NOTE BufSize is currently unused, so just pass 0.
+      createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
+                        /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
+                        CPFuncs[I], DidIt);
+    // NOTE __kmpc_copyprivate already inserts a barrier
+  } else if (!IsNowait)
     createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                   omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                   /* CheckCancelFlag */ false);
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 1b3a58298ec08a..f52bcb74938d15 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -300,6 +300,10 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config,
          Config.SymbolsToLocalize.matches(Sym.Name)))
       Sym.Binding = STB_LOCAL;
 
+    for (auto &[Matcher, Visibility] : ELFConfig.SymbolsToSetVisibility)
+      if (Matcher.matches(Sym.Name))
+        Sym.Visibility = Visibility;
+
     // Note: these two globalize flags have very similar names but different
     // meanings:
     //
diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp
index 0401c20b98ec8e..935749afe33852 100644
--- a/llvm/lib/Object/DXContainer.cpp
+++ b/llvm/lib/Object/DXContainer.cpp
@@ -72,13 +72,13 @@ Error DXContainer::parseDXILHeader(StringRef Part) {
   return Error::success();
 }
 
-Error DXContainer::parseShaderFlags(StringRef Part) {
-  if (ShaderFlags)
+Error DXContainer::parseShaderFeatureFlags(StringRef Part) {
+  if (ShaderFeatureFlags)
     return parseFailed("More than one SFI0 part is present in the file");
   uint64_t FlagValue = 0;
   if (Error Err = readInteger(Part, Part.begin(), FlagValue))
     return Err;
-  ShaderFlags = FlagValue;
+  ShaderFeatureFlags = FlagValue;
   return Error::success();
 }
 
@@ -168,7 +168,7 @@ Error DXContainer::parsePartOffsets() {
         return Err;
       break;
     case dxbc::PartType::SFI0:
-      if (Error Err = parseShaderFlags(PartData))
+      if (Error Err = parseShaderFeatureFlags(PartData))
         return Err;
       break;
     case dxbc::PartType::HASH:
diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
index 1f03f2c7d39966..7dc9822bdd221d 100644
--- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp
+++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp
@@ -23,15 +23,15 @@ namespace llvm {
 static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63,
               "Shader flag bits exceed enum size.");
 
-DXContainerYAML::ShaderFlags::ShaderFlags(uint64_t FlagData) {
-#define SHADER_FLAG(Num, Val, Str)                                             \
+DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) {
+#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
   Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
-uint64_t DXContainerYAML::ShaderFlags::getEncodedFlags() {
+uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() {
   uint64_t Flag = 0;
-#define SHADER_FLAG(Num, Val, Str)                                             \
+#define SHADER_FEATURE_FLAG(Num, Val, Str)                                     \
   if (Val)                                                                     \
     Flag |= (uint64_t)dxbc::FeatureFlags::Val;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
@@ -103,9 +103,9 @@ void MappingTraits<DXContainerYAML::DXILProgram>::mapping(
   IO.mapOptional("DXIL", Program.DXIL);
 }
 
-void MappingTraits<DXContainerYAML::ShaderFlags>::mapping(
-    IO &IO, DXContainerYAML::ShaderFlags &Flags) {
-#define SHADER_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
+void MappingTraits<DXContainerYAML::ShaderFeatureFlags>::mapping(
+    IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) {
+#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val);
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 }
 
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 142bd50b3798e0..991c3ac8f7446c 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -707,7 +707,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
 
   // Re-consider control flow based optimizations after redundancy elimination,
   // redo DCE, etc.
-  if (EnableDFAJumpThreading && Level.getSizeLevel() == 0)
+  if (EnableDFAJumpThreading)
     FPM.addPass(DFAJumpThreadingPass());
 
   FPM.addPass(JumpThreadingPass());
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 635beeed0df833..49bcab588e5204 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5179,7 +5179,8 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
                                         const TargetLibraryInfo *LibInfo) {
 
   SMEAttrs CallerAttrs(*FuncInfo.Fn);
-  if (CallerAttrs.hasZAState() || CallerAttrs.hasStreamingInterfaceOrBody() ||
+  if (CallerAttrs.hasZAState() || CallerAttrs.hasZT0State() ||
+      CallerAttrs.hasStreamingInterfaceOrBody() ||
       CallerAttrs.hasStreamingCompatibleInterface())
     return nullptr;
   return new AArch64FastISel(FuncInfo, LibInfo);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3b92e95d7c2876..c21bc3a4abbc00 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -271,11 +271,9 @@ static bool isMergePassthruOpcode(unsigned Opc) {
 static bool isZeroingInactiveLanes(SDValue Op) {
   switch (Op.getOpcode()) {
   default:
-    // We guarantee i1 splat_vectors to zero the other lanes by
-    // implementing it with ptrue and possibly a punpklo for nxv1i1.
-    if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
-      return true;
     return false;
+  // We guarantee i1 splat_vectors to zero the other lanes
+  case ISD::SPLAT_VECTOR:
   case AArch64ISD::PTRUE:
   case AArch64ISD::SETCC_MERGE_ZERO:
     return true;
@@ -25892,7 +25890,8 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
     auto CallerAttrs = SMEAttrs(*Inst.getFunction());
     auto CalleeAttrs = SMEAttrs(*Base);
     if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
-        CallerAttrs.requiresLazySave(CalleeAttrs))
+        CallerAttrs.requiresLazySave(CalleeAttrs) ||
+        CallerAttrs.requiresPreservingZT0(CalleeAttrs))
       return true;
   }
   return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 3dc3d31a34e84e..26dbad713594aa 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -535,7 +535,8 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
   }
 
   SMEAttrs Attrs(F);
-  if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() ||
+  if (Attrs.hasZAState() || Attrs.hasZT0State() ||
+      Attrs.hasStreamingInterfaceOrBody() ||
       Attrs.hasStreamingCompatibleInterface())
     return true;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index a2e805e8cb56dc..91323e456a5ef8 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -52,6 +52,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
   const LLT v16s8 = LLT::fixed_vector(16, 8);
   const LLT v8s8 = LLT::fixed_vector(8, 8);
   const LLT v4s8 = LLT::fixed_vector(4, 8);
+  const LLT v2s8 = LLT::fixed_vector(2, 8);
   const LLT v8s16 = LLT::fixed_vector(8, 16);
   const LLT v4s16 = LLT::fixed_vector(4, 16);
   const LLT v2s16 = LLT::fixed_vector(2, 16);
@@ -387,8 +388,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampMaxNumElements(0, s32, 4)
       .clampMaxNumElements(0, s64, 2)
       .clampMaxNumElements(0, p0, 2)
+      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
+      .bitcastIf(typeInSet(0, {v4s8}),
+                 [=](const LegalityQuery &Query) {
+                   const LLT VecTy = Query.Types[0];
+                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
+                 })
       .customIf(IsPtrVecPred)
-      .scalarizeIf(typeIs(0, v2s16), 0);
+      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
 
   getActionDefinitionsBuilder(G_STORE)
       .customIf([=](const LegalityQuery &Query) {
@@ -422,8 +429,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .clampMaxNumElements(0, s64, 2)
       .clampMaxNumElements(0, p0, 2)
       .lowerIfMemSizeNotPow2()
+      // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out
+      .bitcastIf(typeInSet(0, {v4s8}),
+                 [=](const LegalityQuery &Query) {
+                   const LLT VecTy = Query.Types[0];
+                   return std::pair(0, LLT::scalar(VecTy.getSizeInBits()));
+                 })
       .customIf(IsPtrVecPred)
-      .scalarizeIf(typeIs(0, v2s16), 0);
+      .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0);
 
   getActionDefinitionsBuilder(G_INDEXED_STORE)
       // Idx 0 == Ptr, Idx 1 == Val
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index e3f72485079538..57769fe998d1fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2337,8 +2337,6 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
 
   ScheduleDAGMI *DAG;
 
-  std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations;
-
   // Organize lists of SchedGroups by their SyncID. SchedGroups /
   // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
   // between then.
@@ -2381,10 +2379,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
   AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
 
   IGroupLPDAGMutation() = default;
-  IGroupLPDAGMutation(
-      AMDGPU::SchedulingPhase Phase,
-      std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations)
-      : SavedMutations(SavedMutations), Phase(Phase) {}
+  IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
 };
 
 unsigned SchedGroup::NumSchedGroups = 0;
@@ -2602,13 +2597,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     PS.solve();
     return;
   }
-
-  if (!SavedMutations)
-    return;
-
-  // We did not apply a mutation, fall back to SavedMutations
-  for (auto &m : *SavedMutations)
-    m->apply(DAG);
 }
 
 void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
@@ -2707,10 +2695,9 @@ namespace llvm {
 /// same scheduling region (e.g. pre and post-RA scheduling / multiple
 /// scheduling "phases"), we can reenter this mutation framework more than once
 /// for a given region.
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
-    AMDGPU::SchedulingPhase Phase,
-    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations) {
-  return std::make_unique<IGroupLPDAGMutation>(Phase, SavedMutations);
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) {
+  return std::make_unique<IGroupLPDAGMutation>(Phase);
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index 46ef4d702d0022..aff7096f26d671 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -20,9 +20,8 @@ namespace AMDGPU {
 enum class SchedulingPhase { Initial, PreRAReentry, PostRA };
 } // namespace AMDGPU
 
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(
-    AMDGPU::SchedulingPhase Phase,
-    std::vector<std::unique_ptr<ScheduleDAGMutation>> *SavedMutations);
+std::unique_ptr<ScheduleDAGMutation>
+createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase);
 
 } // namespace llvm
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0d830df1f1f1df..76e843455bab75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -461,8 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   if (ST.shouldClusterStores())
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-  DAG->addMutation(
-      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
+  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
   DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
   DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
   return DAG;
@@ -472,8 +471,7 @@ static ScheduleDAGInstrs *
 createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
       new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
-  DAG->addMutation(
-      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr));
+  DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
   return DAG;
 }
 
@@ -937,7 +935,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
       DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
     DAG->addMutation(
-        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr));
+        createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
     if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
       DAG->addMutation(createVOPDPairingMutation());
     return DAG;
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b7b471d8dc7b39..18d51087ff5fba 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1685,24 +1685,48 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
 private:
   struct OperandInfoTy {
     SMLoc Loc;
-    int64_t Id;
+    int64_t Val;
     bool IsSymbolic = false;
     bool IsDefined = false;
 
-    OperandInfoTy(int64_t Id_) : Id(Id_) {}
+    OperandInfoTy(int64_t Val) : Val(Val) {}
   };
 
+  struct StructuredOpField : OperandInfoTy {
+    StringLiteral Id;
+    StringLiteral Desc;
+    unsigned Width;
+    bool IsDefined = false;
+
+    StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width,
+                      int64_t Default)
+        : OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {}
+    virtual ~StructuredOpField() = default;
+
+    bool Error(AMDGPUAsmParser &Parser, const Twine &Err) const {
+      Parser.Error(Loc, "invalid " + Desc + ": " + Err);
+      return false;
+    }
+
+    virtual bool validate(AMDGPUAsmParser &Parser) const {
+      if (IsSymbolic && Val == OPR_ID_UNSUPPORTED)
+        return Error(Parser, "not supported on this GPU");
+      if (!isUIntN(Width, Val))
+        return Error(Parser, "only " + Twine(Width) + "-bit values are legal");
+      return true;
+    }
+  };
+
+  ParseStatus parseStructuredOpFields(ArrayRef<StructuredOpField *> Fields);
+  bool validateStructuredOpFields(ArrayRef<const StructuredOpField *> Fields);
+
   bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream);
   bool validateSendMsg(const OperandInfoTy &Msg,
                        const OperandInfoTy &Op,
                        const OperandInfoTy &Stream);
 
-  bool parseHwregBody(OperandInfoTy &HwReg,
-                      OperandInfoTy &Offset,
-                      OperandInfoTy &Width);
-  bool validateHwreg(const OperandInfoTy &HwReg,
-                     const OperandInfoTy &Offset,
-                     const OperandInfoTy &Width);
+  ParseStatus parseHwregFunc(OperandInfoTy &HwReg, OperandInfoTy &Offset,
+                             OperandInfoTy &Width);
 
   SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
   SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
@@ -7197,71 +7221,44 @@ bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); }
 // hwreg
 //===----------------------------------------------------------------------===//
 
-bool
-AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg,
-                                OperandInfoTy &Offset,
-                                OperandInfoTy &Width) {
+ParseStatus AMDGPUAsmParser::parseHwregFunc(OperandInfoTy &HwReg,
+                                            OperandInfoTy &Offset,
+                                            OperandInfoTy &Width) {
   using namespace llvm::AMDGPU::Hwreg;
 
+  if (!trySkipId("hwreg", AsmToken::LParen))
+    return ParseStatus::NoMatch;
+
   // The register may be specified by name or using a numeric code
   HwReg.Loc = getLoc();
   if (isToken(AsmToken::Identifier) &&
-      (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
+      (HwReg.Val = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
     HwReg.IsSymbolic = true;
     lex(); // skip register name
-  } else if (!parseExpr(HwReg.Id, "a register name")) {
-    return false;
+  } else if (!parseExpr(HwReg.Val, "a register name")) {
+    return ParseStatus::Failure;
   }
 
   if (trySkipToken(AsmToken::RParen))
-    return true;
+    return ParseStatus::Success;
 
   // parse optional params
   if (!skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis"))
-    return false;
+    return ParseStatus::Failure;
 
   Offset.Loc = getLoc();
-  if (!parseExpr(Offset.Id))
-    return false;
+  if (!parseExpr(Offset.Val))
+    return ParseStatus::Failure;
 
   if (!skipToken(AsmToken::Comma, "expected a comma"))
-    return false;
+    return ParseStatus::Failure;
 
   Width.Loc = getLoc();
-  return parseExpr(Width.Id) &&
-         skipToken(AsmToken::RParen, "expected a closing parenthesis");
-}
-
-bool
-AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
-                               const OperandInfoTy &Offset,
-                               const OperandInfoTy &Width) {
-
-  using namespace llvm::AMDGPU::Hwreg;
+  if (!parseExpr(Width.Val) ||
+      !skipToken(AsmToken::RParen, "expected a closing parenthesis"))
+    return ParseStatus::Failure;
 
-  if (HwReg.IsSymbolic) {
-    if (HwReg.Id == OPR_ID_UNSUPPORTED) {
-      Error(HwReg.Loc,
-            "specified hardware register is not supported on this GPU");
-      return false;
-    }
-  } else {
-    if (!isValidHwreg(HwReg.Id)) {
-      Error(HwReg.Loc,
-            "invalid code of hardware register: only 6-bit values are legal");
-      return false;
-    }
-  }
-  if (!isValidHwregOffset(Offset.Id)) {
-    Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal");
-    return false;
-  }
-  if (!isValidHwregWidth(Width.Id)) {
-    Error(Width.Loc,
-          "invalid bitfield width: only values from 1 to 32 are legal");
-    return false;
-  }
-  return true;
+  return ParseStatus::Success;
 }
 
 ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
@@ -7270,24 +7267,40 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
   int64_t ImmVal = 0;
   SMLoc Loc = getLoc();
 
-  if (trySkipId("hwreg", AsmToken::LParen)) {
-    OperandInfoTy HwReg(OPR_ID_UNKNOWN);
-    OperandInfoTy Offset(HwregOffset::Default);
-    OperandInfoTy Width(HwregSize::Default);
-    if (parseHwregBody(HwReg, Offset, Width) &&
-        validateHwreg(HwReg, Offset, Width)) {
-      ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id);
-    } else {
-      return ParseStatus::Failure;
+  StructuredOpField HwReg("id", "hardware register", HwregId::Width,
+                          HwregId::Default);
+  StructuredOpField Offset("offset", "bit offset", HwregOffset::Width,
+                           HwregOffset::Default);
+  struct : StructuredOpField {
+    using StructuredOpField::StructuredOpField;
+    bool validate(AMDGPUAsmParser &Parser) const override {
+      if (!isUIntN(Width, Val - 1))
+        return Error(Parser, "only values from 1 to 32 are legal");
+      return true;
     }
-  } else if (parseExpr(ImmVal, "a hwreg macro")) {
-    if (ImmVal < 0 || !isUInt<16>(ImmVal))
-      return Error(Loc, "invalid immediate: only 16-bit values are legal");
-  } else {
-    return ParseStatus::Failure;
+  } Width("size", "bitfield width", HwregSize::Width, HwregSize::Default);
+  ParseStatus Res = parseStructuredOpFields({&HwReg, &Offset, &Width});
+
+  if (Res.isNoMatch())
+    Res = parseHwregFunc(HwReg, Offset, Width);
+
+  if (Res.isSuccess()) {
+    if (!validateStructuredOpFields({&HwReg, &Offset, &Width}))
+      return ParseStatus::Failure;
+    ImmVal = HwregEncoding::encode(HwReg.Val, Offset.Val, Width.Val);
   }
 
-  Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
+  if (Res.isNoMatch() &&
+      parseExpr(ImmVal, "a hwreg macro, structured immediate"))
+    Res = ParseStatus::Success;
+
+  if (!Res.isSuccess())
+    return ParseStatus::Failure;
+
+  if (!isUInt<16>(ImmVal))
+    return Error(Loc, "invalid immediate: only 16-bit values are legal");
+  Operands.push_back(
+      AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
   return ParseStatus::Success;
 }
 
@@ -7307,10 +7320,10 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
 
   Msg.Loc = getLoc();
   if (isToken(AsmToken::Identifier) &&
-      (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
+      (Msg.Val = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) {
     Msg.IsSymbolic = true;
     lex(); // skip message name
-  } else if (!parseExpr(Msg.Id, "a message name")) {
+  } else if (!parseExpr(Msg.Val, "a message name")) {
     return false;
   }
 
@@ -7318,16 +7331,16 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg,
     Op.IsDefined = true;
     Op.Loc = getLoc();
     if (isToken(AsmToken::Identifier) &&
-        (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) {
+        (Op.Val = getMsgOpId(Msg.Val, getTokenStr())) >= 0) {
       lex(); // skip operation name
-    } else if (!parseExpr(Op.Id, "an operation name")) {
+    } else if (!parseExpr(Op.Val, "an operation name")) {
       return false;
     }
 
     if (trySkipToken(AsmToken::Comma)) {
       Stream.IsDefined = true;
       Stream.Loc = getLoc();
-      if (!parseExpr(Stream.Id))
+      if (!parseExpr(Stream.Val))
         return false;
     }
   }
@@ -7347,17 +7360,17 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
   bool Strict = Msg.IsSymbolic;
 
   if (Strict) {
-    if (Msg.Id == OPR_ID_UNSUPPORTED) {
+    if (Msg.Val == OPR_ID_UNSUPPORTED) {
       Error(Msg.Loc, "specified message id is not supported on this GPU");
       return false;
     }
   } else {
-    if (!isValidMsgId(Msg.Id, getSTI())) {
+    if (!isValidMsgId(Msg.Val, getSTI())) {
       Error(Msg.Loc, "invalid message id");
       return false;
     }
   }
-  if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) {
+  if (Strict && (msgRequiresOp(Msg.Val, getSTI()) != Op.IsDefined)) {
     if (Op.IsDefined) {
       Error(Op.Loc, "message does not support operations");
     } else {
@@ -7365,16 +7378,16 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
     }
     return false;
   }
-  if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) {
+  if (!isValidMsgOp(Msg.Val, Op.Val, getSTI(), Strict)) {
     Error(Op.Loc, "invalid operation id");
     return false;
   }
-  if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) &&
+  if (Strict && !msgSupportsStream(Msg.Val, Op.Val, getSTI()) &&
       Stream.IsDefined) {
     Error(Stream.Loc, "message operation does not support streams");
     return false;
   }
-  if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) {
+  if (!isValidMsgStream(Msg.Val, Op.Val, Stream.Val, getSTI(), Strict)) {
     Error(Stream.Loc, "invalid message stream id");
     return false;
   }
@@ -7393,7 +7406,7 @@ ParseStatus AMDGPUAsmParser::parseSendMsg(OperandVector &Operands) {
     OperandInfoTy Stream(STREAM_ID_NONE_);
     if (parseSendMsgBody(Msg, Op, Stream) &&
         validateSendMsg(Msg, Op, Stream)) {
-      ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
+      ImmVal = encodeMsg(Msg.Val, Op.Val, Stream.Val);
     } else {
       return ParseStatus::Failure;
     }
@@ -7730,6 +7743,48 @@ AMDGPUAsmParser::getConstLoc(const OperandVector &Operands) const {
   return getOperandLoc(Test, Operands);
 }
 
+ParseStatus
+AMDGPUAsmParser::parseStructuredOpFields(ArrayRef<StructuredOpField *> Fields) {
+  if (!trySkipToken(AsmToken::LCurly))
+    return ParseStatus::NoMatch;
+
+  bool First = true;
+  while (!trySkipToken(AsmToken::RCurly)) {
+    if (!First &&
+        !skipToken(AsmToken::Comma, "comma or closing brace expected"))
+      return ParseStatus::Failure;
+
+    StringRef Id = getTokenStr();
+    SMLoc IdLoc = getLoc();
+    if (!skipToken(AsmToken::Identifier, "field name expected") ||
+        !skipToken(AsmToken::Colon, "colon expected"))
+      return ParseStatus::Failure;
+
+    auto I =
+        find_if(Fields, [Id](StructuredOpField *F) { return F->Id == Id; });
+    if (I == Fields.end())
+      return Error(IdLoc, "unknown field");
+    if ((*I)->IsDefined)
+      return Error(IdLoc, "duplicate field");
+
+    // TODO: Support symbolic values.
+    (*I)->Loc = getLoc();
+    if (!parseExpr((*I)->Val))
+      return ParseStatus::Failure;
+    (*I)->IsDefined = true;
+
+    First = false;
+  }
+  return ParseStatus::Success;
+}
+
+bool AMDGPUAsmParser::validateStructuredOpFields(
+    ArrayRef<const StructuredOpField *> Fields) {
+  return all_of(Fields, [this](const StructuredOpField *F) {
+    return F->validate(*this);
+  });
+}
+
 //===----------------------------------------------------------------------===//
 // swizzle
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index f993ec8409c997..9f419a7fbf6834 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
     return false;
 
   SavedMutations.swap(DAG.Mutations);
-  DAG.addMutation(createIGroupLPDAGMutation(
-      AMDGPU::SchedulingPhase::PreRAReentry, nullptr));
+  DAG.addMutation(
+      createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));
 
   InitialOccupancy = DAG.MinOccupancy;
   // Aggressivly try to reduce register pressure in the unclustered high RP
@@ -858,8 +858,7 @@ bool GCNSchedStage::initGCNRegion() {
                           StageID == GCNSchedStageID::ILPInitialSchedule;
     DAG.addMutation(createIGroupLPDAGMutation(
         IsInitialStage ? AMDGPU::SchedulingPhase::Initial
-                       : AMDGPU::SchedulingPhase::PreRAReentry,
-        &SavedMutations));
+                       : AMDGPU::SchedulingPhase::PreRAReentry));
   }
 
   return true;
@@ -1573,8 +1572,7 @@ void GCNPostScheduleDAGMILive::schedule() {
   if (HasIGLPInstrs) {
     SavedMutations.clear();
     SavedMutations.swap(Mutations);
-    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA,
-                                          &SavedMutations));
+    addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
   }
 
   ScheduleDAGMI::schedule();
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index d774826c1d08c0..a8a33a5fecb413 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
       return AMDGPU::S_WAIT_BVHCNT;
     case AMDGPU::S_WAIT_DSCNT_soft:
       return AMDGPU::S_WAIT_DSCNT;
+    case AMDGPU::S_WAIT_KMCNT_soft:
+      return AMDGPU::S_WAIT_KMCNT;
     default:
       return Opcode;
     }
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index f62e808b33e42b..4069a368f68719 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -312,6 +312,10 @@ class SICacheControl {
                                               SIMemOp Op, bool IsVolatile,
                                               bool IsNonTemporal) const = 0;
 
+  virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
+    return false;
+  };
+
   /// Inserts any necessary instructions at position \p Pos relative
   /// to instruction \p MI to ensure memory instructions before \p Pos of kind
   /// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -589,6 +593,15 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
   bool setScope(const MachineBasicBlock::iterator MI,
                 AMDGPU::CPol::CPol Value) const;
 
+  // Stores with system scope (SCOPE_SYS) need to wait for:
+  // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
+  // - non-returning-atomics       - wait for STORECNT==0
+  //   TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
+  //   since it does not distinguish atomics-with-return from regular stores.
+  // There is no need to wait if memory is cached (mtype != UC).
+  bool
+  insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
+
 public:
   SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
 
@@ -603,6 +616,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
                                       SIAtomicAddrSpace AddrSpace, SIMemOp Op,
                                       bool IsVolatile,
                                       bool IsNonTemporal) const override;
+
+  bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
 };
 
 class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
   return false;
 }
 
+bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
+    const MachineBasicBlock::iterator MI) const {
+  // TODO: implement flag for frontend to give us a hint not to insert waits.
+
+  MachineBasicBlock &MBB = *MI->getParent();
+  const DebugLoc &DL = MI->getDebugLoc();
+
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
+  BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
+
+  return true;
+}
+
 bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
                                      SIAtomicScope Scope,
                                      SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   if (IsVolatile) {
     Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
 
+    if (Op == SIMemOp::STORE)
+      Changed |= insertWaitsBeforeSystemScopeStore(MI);
+
     // Ensure operation has completed at system scope to cause all volatile
     // operations to be visible outside the program in a global order. Do not
     // request cross address space as only the global address space can be
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
   return Changed;
 }
 
+bool SIGfx12CacheControl::expandSystemScopeStore(
+    MachineBasicBlock::iterator &MI) const {
+  MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+  if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
+    return insertWaitsBeforeSystemScopeStore(MI);
+
+  return false;
+}
+
 bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
   if (AtomicPseudoMIs.empty())
     return false;
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
   Changed |= CC->enableVolatileAndOrNonTemporal(
       MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
       MOI.isNonTemporal());
+
+  // GFX12 specific, scope(desired coherence domain in cache hierarchy) is
+  // instruction field, do not confuse it with atomic scope.
+  Changed |= CC->expandSystemScopeStore(MI);
   return Changed;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 53fc2c0686245f..afc380b4203457 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -472,12 +472,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
   }
 
   // Move MI before v_or_b32
-  auto MBB = MI.getParent();
-  MBB->remove(&MI);
-  MBB->insert(getParentInst(), &MI);
+  MI.getParent()->remove(&MI);
+  getParentInst()->getParent()->insert(getParentInst(), &MI);
 
   // Add Implicit use of preserved register
-  MachineInstrBuilder MIB(*MBB->getParent(), MI);
+  MachineInstrBuilder MIB(*MI.getMF(), MI);
   MIB.addReg(getPreservedOperand()->getReg(),
              RegState::ImplicitKill,
              getPreservedOperand()->getSubReg());
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 0fe2845f8edc31..b5de311f8c58ce 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
   def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
   def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
   def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+  def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
 }
 
 def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 177d99a0ac0abe..963dc2882fcc0d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1700,16 +1700,6 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) {
   return (Idx < 0) ? Idx : Opr[Idx].Encoding;
 }
 
-bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt<HwregId::Width>(Id); }
-
-bool isValidHwregOffset(int64_t Offset) {
-  return 0 <= Offset && isUInt<HwregOffset::Width>(Offset);
-}
-
-bool isValidHwregWidth(int64_t Width) {
-  return 0 <= (Width - 1) && isUInt<HwregSize::Width>(Width - 1);
-}
-
 StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) {
   int Idx = getOprIdx<const MCSubtargetInfo &>(Id, Opr, OPR_SIZE, STI);
   return (Idx < 0) ? "" : Opr[Idx].Name;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9a6d0834679eae..b2fc7d874fe588 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1068,15 +1068,6 @@ using HwregEncoding = EncodingFields<HwregId, HwregOffset, HwregSize>;
 LLVM_READONLY
 int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI);
 
-LLVM_READNONE
-bool isValidHwreg(int64_t Id);
-
-LLVM_READNONE
-bool isValidHwregOffset(int64_t Offset);
-
-LLVM_READNONE
-bool isValidHwregWidth(int64_t Width);
-
 LLVM_READNONE
 StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI);
 
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index f0b69b0b09809f..e78ea63c3b4a56 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1468,15 +1468,21 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
   if (passesFPReg)
     assert(STI->hasFPRegs() && "Subtarget needs fpregs");
 
-  // Lazy store all fp registers to the stack.
+  // Lazy store all fp registers to the stack
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
-  for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                 ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-    VLSTM.addReg(R, RegState::Implicit |
-                        (LiveRegs.contains(R) ? 0 : RegState::Undef));
+  MachineInstrBuilder VLSTM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseoudo register list, has no effect on
+                      // the encoding.
+  // Mark non-live registers as undef
+  for (MachineOperand &MO : VLSTM->implicit_operands()) {
+    if (MO.isReg() && !MO.isDef()) {
+      Register Reg = MO.getReg();
+      MO.setIsUndef(!LiveRegs.contains(Reg));
+    }
+  }
 
   // Restore all arguments
   for (const auto &Regs : ClearedFPRegs) {
@@ -1563,14 +1569,20 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
         .addImm(CMSE_FP_SAVE_SIZE >> 2)
         .add(predOps(ARMCC::AL));
 
-    // Lazy store all FP registers to the stack
-    MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                    .addReg(ARM::SP)
-                                    .add(predOps(ARMCC::AL));
-    for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                   ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-      VLSTM.addReg(R, RegState::Implicit |
-                          (LiveRegs.contains(R) ? 0 : RegState::Undef));
+    // Lazy store all fp registers to the stack.
+    MachineInstrBuilder VLSTM =
+        BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+            .addReg(ARM::SP)
+            .add(predOps(ARMCC::AL))
+            .addImm(0); // Represents a pseoudo register list, has no effect on
+                        // the encoding.
+    // Mark non-live registers as undef
+    for (MachineOperand &MO : VLSTM->implicit_operands()) {
+      if (MO.isReg() && MO.isImplicit() && !MO.isDef()) {
+        Register Reg = MO.getReg();
+        MO.setIsUndef(!LiveRegs.contains(Reg));
+      }
+    }
   } else {
     // Push all the callee-saved registers (s16-s31).
     MachineInstrBuilder VPUSH =
@@ -1673,9 +1685,12 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
 
   // Lazy load fp regs from stack.
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
+  MachineInstrBuilder VLLDM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseoudo register list, has no effect on
+                      // the encoding.
 
   if (STI->fixCMSE_CVE_2021_35465()) {
     auto Bundler = MIBundleBuilder(MBB, VLLDM);
@@ -1757,7 +1772,9 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81(
     // Load FP registers from stack.
     BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
         .addReg(ARM::SP)
-        .add(predOps(ARMCC::AL));
+        .add(predOps(ARMCC::AL))
+        .addImm(0); // Represents a pseoudo register list, has no effect on the
+                    // encoding.
 
     // Pop the stack space
     BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 14e315534570d2..404085820a6660 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -1749,6 +1749,37 @@ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
   let Inst{8}     = 0;          // Single precision
 }
 
+// Single Precision with fixed registers.
+// For when the registers-to-be-stored/loaded are fixed, e.g. VLLDM and VLSTM
+class AXSI4FR<string asm, bit et, bit load>
+    : InstARM<AddrMode4, 4, IndexModeNone, VFPLdStMulFrm, VFPDomain, "", NoItinerary> {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs;    // Does not affect encoding, for assembly/disassembly only.
+  list<Predicate> Predicates = [HasVFP2];
+  let OutOperandList = (outs);
+  let InOperandList = (ins GPRnopc:$Rn, pred:$p, dpr_reglist:$regs);
+  let AsmString = asm;
+  let Pattern = [];
+  let DecoderNamespace = "VFP";
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{31-28} = 0b1110;
+  let Inst{27-25} = 0b110;
+  let Inst{24}    = 0b0;
+  let Inst{23}    = 0b0;
+  let Inst{22}    = 0b0;
+  let Inst{21}    = 0b1;
+  let Inst{20}    = load;       // Distinguishes vlldm from vlstm
+  let Inst{15-12} = 0b0000;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7}     = et;         // encoding type, 0 for T1 and 1 for T2.
+  let Inst{6-0}   = 0b0000000;
+  let mayLoad     = load;
+  let mayStore    = !eq(load, 0);
+}
+
 // Double precision, unary
 class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 55d3efbd9b9a2b..3094a4db2b4d12 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -313,29 +313,51 @@ def : MnemonicAlias<"vstm", "vstmia">;
 //===----------------------------------------------------------------------===//
 //  Lazy load / store multiple Instructions
 //
-def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlldm${p}\t$Rn", "", []>,
+// VLLDM and VLSTM:
+// 2 encoding options:
+// T1 (bit 7 is 0):
+// T1 takes an optional dpr_reglist, must be '{d0-d15}' (exactly)
+// T1 require v8-M.Main, secure state, target with 16 D registers (or with no D registers - NOP)
+// T2 (bit 7 is 1):
+// T2 takes a mandatory dpr_reglist, must be '{d0-d31}' (exactly)
+// T2 require v8.1-M.Main, secure state, target with 16/32 D registers (or with no D registers - NOP)
+// (source: Arm v8-M ARM, DDI0553B.v ID16122022)
+
+def VLLDM : AXSI4FR<"vlldm${p}\t$Rn, $regs", 0, 1>,
             Requires<[HasV8MMainline, Has8MSecExt]> {
-    let Inst{24-23} = 0b00;
-    let Inst{22}    = 0;
-    let Inst{21}    = 1;
-    let Inst{20}    = 1;
-    let Inst{15-12} = 0;
-    let Inst{7-0}   = 0;
-    let mayLoad     = 1;
-    let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
-}
-
-def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlstm${p}\t$Rn", "", []>,
+    let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contains the register list.
+def : InstAlias<"vlldm${p}\t$Rn", (VLLDM GPRnopc:$Rn, pred:$p, 0)>,
+                Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contains the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLLDM_T2 : AXSI4FR<"vlldm${p}\t$Rn, $regs", 1, 1>,
+            Requires<[HasV8_1MMainline, Has8MSecExt]> {
+    let Defs = [VPR, FPSCR, FPSCR_NZCV, D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+                                        D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly contains the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM : AXSI4FR<"vlstm${p}\t$Rn, $regs", 0, 0>,
             Requires<[HasV8MMainline, Has8MSecExt]> {
-    let Inst{24-23} = 0b00;
-    let Inst{22}    = 0;
-    let Inst{21}    = 1;
-    let Inst{20}    = 0;
-    let Inst{15-12} = 0;
-    let Inst{7-0}   = 0;
-    let mayStore    = 1;
+    let Defs = [VPR, FPSCR, FPSCR_NZCV];
+    let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlstm${p}\t$Rn", (VLSTM GPRnopc:$Rn, pred:$p, 0)>,
+                Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding, it is for assembly/disassembly purposes only.
+def VLSTM_T2 : AXSI4FR<"vlstm${p}\t$Rn, $regs", 1, 0>,
+            Requires<[HasV8_1MMainline, Has8MSecExt]> {
+    let Defs = [VPR, FPSCR, FPSCR_NZCV];
+    let Uses = [VPR, FPSCR, FPSCR_NZCV, D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+                                        D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+    let DecoderMethod = "DecodeLazyLoadStoreMul";
 }
 
 def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 37bfb76a494dee..5efbaf0d41060c 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -450,11 +450,12 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
                            unsigned ListNo);
 
-  int tryParseRegister();
+  int tryParseRegister(bool AllowOutofBoundReg = false);
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
   bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
-                         bool AllowRAAC = false);
+                         bool AllowRAAC = false,
+                         bool AllowOutOfBoundReg = false);
   bool parseMemory(OperandVector &);
   bool parseOperand(OperandVector &, StringRef Mnemonic);
   bool parseImmExpr(int64_t &Out);
@@ -4072,7 +4073,7 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
 /// Try to parse a register name.  The token must be an Identifier when called,
 /// and if it is a register name the token is eaten and the register number is
 /// returned.  Otherwise return -1.
-int ARMAsmParser::tryParseRegister() {
+int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) return -1;
@@ -4116,7 +4117,8 @@ int ARMAsmParser::tryParseRegister() {
   }
 
   // Some FPUs only have 16 D registers, so D16-D31 are invalid
-  if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+  if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 &&
+      RegNum <= ARM::D31)
     return -1;
 
   Parser.Lex(); // Eat identifier token.
@@ -4456,7 +4458,7 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
 
 /// Parse a register list.
 bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
-                                     bool AllowRAAC) {
+                                     bool AllowRAAC, bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::LCurly))
     return TokError("Token is not a Left Curly Brace");
@@ -4510,7 +4512,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
         return Error(RegLoc, "pseudo-register not allowed");
       Parser.Lex(); // Eat the minus.
       SMLoc AfterMinusLoc = Parser.getTok().getLoc();
-      int EndReg = tryParseRegister();
+      int EndReg = tryParseRegister(AllowOutOfBoundReg);
       if (EndReg == -1)
         return Error(AfterMinusLoc, "register expected");
       if (EndReg == ARM::RA_AUTH_CODE)
@@ -4545,7 +4547,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
     RegLoc = Parser.getTok().getLoc();
     int OldReg = Reg;
     const AsmToken RegTok = Parser.getTok();
-    Reg = tryParseRegister();
+    Reg = tryParseRegister(AllowOutOfBoundReg);
     if (Reg == -1)
       return Error(RegLoc, "register expected");
     if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE)
@@ -6085,8 +6087,11 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
   }
   case AsmToken::LBrac:
     return parseMemory(Operands);
-  case AsmToken::LCurly:
-    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"));
+  case AsmToken::LCurly: {
+    bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm";
+    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false,
+                             AllowOutOfBoundReg);
+  }
   case AsmToken::Dollar:
   case AsmToken::Hash: {
     // #42 -> immediate
@@ -7596,6 +7601,33 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
 
   const unsigned Opcode = Inst.getOpcode();
   switch (Opcode) {
+  case ARM::VLLDM:
+  case ARM::VLLDM_T2:
+  case ARM::VLSTM:
+  case ARM::VLSTM_T2: {
+    // Since in some cases both T1 and T2 are valid, tablegen can not always
+    // pick the correct instruction.
+    if (Operands.size() == 4) { // a register list has been provided
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // the register list, a dpr_reglist
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // T2 requires v8.1-M.Main (cannot be handled by tablegen)
+      if (RegList.size() == 32 && !hasV8_1MMainline()) {
+        return Error(Op.getEndLoc(), "T2 version requires v8.1-M.Main");
+      }
+      // When target has 32 D registers, T1 is undefined.
+      if (hasD32() && RegList.size() != 32) {
+        return Error(Op.getEndLoc(), "operand must be exactly {d0-d31}");
+      }
+      // When target has 16 D registers, both T1 and T2 are valid.
+      if (!hasD32() && (RegList.size() != 16 && RegList.size() != 32)) {
+        return Error(Op.getEndLoc(),
+                     "operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)");
+      }
+    }
+    return false;
+  }
   case ARM::t2IT: {
     // Encoding is unpredictable if it ever results in a notional 'NV'
     // predicate. Since we don't parse 'NV' directly this means an 'AL'
@@ -8731,6 +8763,32 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   }
 
   switch (Inst.getOpcode()) {
+  case ARM::VLLDM:
+  case ARM::VLSTM: {
+    // In some cases both T1 and T2 are valid, causing tablegen pick T1 instead
+    // of T2
+    if (Operands.size() == 4) { // a register list has been provided
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // the register list, a dpr_reglist
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // When the register list is {d0-d31} the instruction has to be the T2
+      // variant
+      if (RegList.size() == 32) {
+        const unsigned Opcode =
+            (Inst.getOpcode() == ARM::VLLDM) ? ARM::VLLDM_T2 : ARM::VLSTM_T2;
+        MCInst TmpInst;
+        TmpInst.setOpcode(Opcode);
+        TmpInst.addOperand(Inst.getOperand(0));
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(3));
+        Inst = TmpInst;
+        return true;
+      }
+    }
+    return false;
+  }
   // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
   case ARM::LDRT_POST:
   case ARM::LDRBT_POST: {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 604f22d7111900..705f3cbce12f02 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -700,6 +700,9 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
 static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder);
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder);
 
 #include "ARMGenDisassemblerTables.inc"
 
@@ -7030,3 +7033,23 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
 
   return DS;
 }
+
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  // Adding Rn, holding memory location to save/load to/from, the only argument
+  // that is being encoded.
+  // '$Rn' in the assembly.
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // An optional predicate, '$p' in the assembly.
+  DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder);
+  // An immediate that represents a floating point registers list. '$regs' in
+  // the assembly.
+  Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect.
+
+  return S;
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index fbd067d79af0b3..24e627cd9a4e1f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -91,6 +91,38 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
   unsigned Opcode = MI->getOpcode();
 
   switch (Opcode) {
+  case ARM::VLLDM: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLLDM_T2: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
+  case ARM::VLSTM: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLSTM_T2: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
   // Check for MOVs and print canonical forms, instead.
   case ARM::MOVsr: {
     // FIXME: Thumb variants?
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
index bbb56435660211..66a9dc46bcbfbf 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp
@@ -51,7 +51,7 @@ void ComputedShaderFlags::print(raw_ostream &OS) const {
   if (FlagVal == 0)
     return;
   OS << "; Note: shader requires additional functionality:\n";
-#define SHADER_FLAG(bit, FlagName, Str)                                        \
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
   if (FlagName)                                                                \
     OS << ";       " Str "\n";
 #include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h
index 4f51873a2d0b34..574a7b090f5281 100644
--- a/llvm/lib/Target/DirectX/DXILShaderFlags.h
+++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h
@@ -29,17 +29,17 @@ class GlobalVariable;
 namespace dxil {
 
 struct ComputedShaderFlags {
-#define SHADER_FLAG(bit, FlagName, Str) bool FlagName : 1;
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
 
-#define SHADER_FLAG(bit, FlagName, Str) FlagName = false;
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false;
   ComputedShaderFlags() {
 #include "llvm/BinaryFormat/DXContainerConstants.def"
   }
 
   operator uint64_t() const {
     uint64_t FlagValue = 0;
-#define SHADER_FLAG(bit, FlagName, Str)                                        \
+#define SHADER_FEATURE_FLAG(bit, FlagName, Str)                                \
   FlagValue |=                                                                 \
       FlagName ? static_cast<uint64_t>(dxbc::FeatureFlags::FlagName) : 0ull;
 #include "llvm/BinaryFormat/DXContainerConstants.def"
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e34750d057301c..dde1882f5eea83 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -9728,8 +9728,15 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
 
   auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
 
+  ElementCount EndIndex =
+      ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount();
   VL = computeVLMax(SubVecVT, DL, DAG);
 
+  // Use tail agnostic policy if we're inserting over InterSubVT's tail.
+  unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
+  if (EndIndex == InterSubVT.getVectorElementCount())
+    Policy = RISCVII::TAIL_AGNOSTIC;
+
   // If we're inserting into the lowest elements, use a tail undisturbed
   // vmv.v.v.
   if (RemIdx == 0) {
@@ -9743,7 +9750,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
 
     SubVec = getVSlideup(DAG, Subtarget, DL, InterSubVT, AlignedExtract, SubVec,
-                         SlideupAmt, Mask, VL);
+                         SlideupAmt, Mask, VL, Policy);
   }
 
   // If required, insert this subvector back into the correct vector register.
diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt
index bb2ccea5c14598..f430be2653b4ee 100644
--- a/llvm/lib/Target/WebAssembly/CMakeLists.txt
+++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_target(WebAssemblyCodeGen
   WebAssemblyOptimizeLiveIntervals.cpp
   WebAssemblyOptimizeReturned.cpp
   WebAssemblyPeephole.cpp
+  WebAssemblyRefTypeMem2Local.cpp
   WebAssemblyRegisterInfo.cpp
   WebAssemblyRegColoring.cpp
   WebAssemblyRegNumbering.cpp
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index 91765ad117bdb0..1c40addb6d6f78 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -30,6 +30,7 @@ ModulePass *createWebAssemblyAddMissingPrototypes();
 ModulePass *createWebAssemblyFixFunctionBitcasts();
 FunctionPass *createWebAssemblyOptimizeReturned();
 FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv();
+FunctionPass *createWebAssemblyRefTypeMem2Local();
 
 // ISel and immediate followup passes.
 FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
@@ -59,6 +60,7 @@ ModulePass *createWebAssemblyMCLowerPrePass();
 // PassRegistry initialization declarations.
 void initializeFixFunctionBitcastsPass(PassRegistry &);
 void initializeOptimizeReturnedPass(PassRegistry &);
+void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &);
 void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
 void initializeWebAssemblyArgumentMovePass(PassRegistry &);
 void initializeWebAssemblyCFGSortPass(PassRegistry &);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 36f067956e63a5..7c47790d1e3515 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -43,8 +43,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-lower"
 
-extern cl::opt<bool> WasmEmitMultiValue;
-
 WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     const TargetMachine &TM, const WebAssemblySubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
@@ -1290,7 +1288,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     LLVMContext & /*Context*/) const {
   // WebAssembly can only handle returning tuples with multivalue enabled
-  return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1;
+  return Subtarget->hasMultivalue() || Outs.size() <= 1;
 }
 
 SDValue WebAssemblyTargetLowering::LowerReturn(
@@ -1298,8 +1296,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn(
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
     SelectionDAG &DAG) const {
-  assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) ||
-          Outs.size() <= 1) &&
+  assert((Subtarget->hasMultivalue() || Outs.size() <= 1) &&
          "MVP WebAssembly can only return up to one value");
   if (!callingConvSupported(CallConv))
     fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index b969b8370a3e5e..1e959111a4dbcb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -22,8 +22,6 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
-extern cl::opt<bool> WasmEmitMultiValue;
-
 WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
 
 MachineFunctionInfo *WebAssemblyFunctionInfo::clone(
@@ -73,8 +71,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty,
 
   MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
   if (Results.size() > 1 &&
-      (!TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue() ||
-       !WasmEmitMultiValue)) {
+      !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
     // WebAssembly can't lower returns of multiple values without demoting to
     // sret unless multivalue is enabled (see
     // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp
new file mode 100644
index 00000000000000..d3c60ee289dfd2
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp
@@ -0,0 +1,91 @@
+//=== WebAssemblyRefTypeMem2Local.cpp - WebAssembly RefType Mem2Local -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Assign reference type allocas to local addrspace (addrspace(1)) so that
+/// their loads and stores can be lowered to local.gets/local.sets.
+///
+//===----------------------------------------------------------------------===//
+
+#include "Utils/WasmAddressSpaces.h"
+#include "Utils/WebAssemblyTypeUtilities.h"
+#include "WebAssembly.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-ref-type-mem2local"
+
+namespace {
+class WebAssemblyRefTypeMem2Local final
+    : public FunctionPass,
+      public InstVisitor<WebAssemblyRefTypeMem2Local> {
+  StringRef getPassName() const override {
+    return "WebAssembly Reference Types Memory to Local";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnFunction(Function &F) override;
+  bool Changed = false;
+
+public:
+  static char ID;
+  WebAssemblyRefTypeMem2Local() : FunctionPass(ID) {}
+
+  void visitAllocaInst(AllocaInst &AI);
+};
+} // End anonymous namespace
+
+char WebAssemblyRefTypeMem2Local::ID = 0;
+INITIALIZE_PASS(WebAssemblyRefTypeMem2Local, DEBUG_TYPE,
+                "Assign reference type allocas to local address space", true,
+                false)
+
+FunctionPass *llvm::createWebAssemblyRefTypeMem2Local() {
+  return new WebAssemblyRefTypeMem2Local();
+}
+
+void WebAssemblyRefTypeMem2Local::visitAllocaInst(AllocaInst &AI) {
+  if (WebAssembly::isWebAssemblyReferenceType(AI.getAllocatedType())) {
+    Changed = true;
+    IRBuilder<> IRB(AI.getContext());
+    IRB.SetInsertPoint(&AI);
+    auto *NewAI = IRB.CreateAlloca(AI.getAllocatedType(),
+                                   WebAssembly::WASM_ADDRESS_SPACE_VAR, nullptr,
+                                   AI.getName() + ".var");
+
+    // The below is basically equivalent to AI.replaceAllUsesWith(NewAI), but we
+    // cannot use it because it requires the old and new types be the same,
+    // which is not true here because the address spaces are different.
+    if (AI.hasValueHandle())
+      ValueHandleBase::ValueIsRAUWd(&AI, NewAI);
+    if (AI.isUsedByMetadata())
+      ValueAsMetadata::handleRAUW(&AI, NewAI);
+    while (!AI.materialized_use_empty()) {
+      Use &U = *AI.materialized_use_begin();
+      U.set(NewAI);
+    }
+
+    AI.eraseFromParent();
+  }
+}
+
+bool WebAssemblyRefTypeMem2Local::runOnFunction(Function &F) {
+  LLVM_DEBUG(dbgs() << "********** WebAssembly RefType Mem2Local **********\n"
+                       "********** Function: "
+                    << F.getName() << '\n');
+
+  visit(F);
+  return Changed;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index 2a84c90c89602e..3e2e029695ab6f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -24,8 +24,6 @@
 
 using namespace llvm;
 
-extern cl::opt<bool> WasmEmitMultiValue;
-
 namespace {
 
 enum RuntimeLibcallSignature {
@@ -696,7 +694,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_func_f32:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -705,7 +703,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F32);
     break;
   case i64_i64_func_f64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -714,7 +712,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::F64);
     break;
   case i16_i16_func_i16_i16:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -724,7 +722,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i32_i32_func_i32_i32:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I32);
       Rets.push_back(wasm::ValType::I32);
     } else {
@@ -734,7 +732,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64_i64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -744,7 +742,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -756,7 +754,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_iPTR:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -769,7 +767,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(PtrTy);
     break;
   case i64_i64_i64_i64_func_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
@@ -783,7 +781,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i32:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -853,7 +851,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i64_i64_i64_i64_i64_i64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -867,7 +865,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I64);
     break;
   case i64_i64_func_i32:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
@@ -876,7 +874,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
     Params.push_back(wasm::ValType::I32);
     break;
   case i64_i64_func_i64:
-    if (Subtarget.hasMultivalue() && WasmEmitMultiValue) {
+    if (Subtarget.hasMultivalue()) {
       Rets.push_back(wasm::ValType::I64);
       Rets.push_back(wasm::ValType::I64);
     } else {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index b2f7ee970a732f..4d4cae110148d6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -54,15 +54,6 @@ static cl::opt<bool> WasmDisableFixIrreducibleControlFlowPass(
              " irreducible control flow optimization pass"),
     cl::init(false));
 
-// A temporary option to control emission of multivalue until multivalue
-// implementation is stable enough. We currently don't emit multivalue by
-// default even if the feature section allows it.
-// TODO Stabilize multivalue and delete this option
-cl::opt<bool>
-    WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden,
-                       cl::desc("WebAssembly: Emit multivalue in the backend"),
-                       cl::init(false));
-
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
   // Register the target.
   RegisterTargetMachine<WebAssemblyTargetMachine> X(
@@ -77,6 +68,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
   initializeLowerGlobalDtorsLegacyPassPass(PR);
   initializeFixFunctionBitcastsPass(PR);
   initializeOptimizeReturnedPass(PR);
+  initializeWebAssemblyRefTypeMem2LocalPass(PR);
   initializeWebAssemblyArgumentMovePass(PR);
   initializeWebAssemblySetP2AlignOperandsPass(PR);
   initializeWebAssemblyReplacePhysRegsPass(PR);
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index be416fb0db0695..d914e1b61ab075 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1418,6 +1418,34 @@ bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
   return !isWin64Prologue(MF) && MF.needsFrameMoves();
 }
 
+/// Return true if an opcode is part of the REP group of instructions
+static bool isOpcodeRep(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::REPNE_PREFIX:
+  case X86::REP_MOVSB_32:
+  case X86::REP_MOVSB_64:
+  case X86::REP_MOVSD_32:
+  case X86::REP_MOVSD_64:
+  case X86::REP_MOVSQ_32:
+  case X86::REP_MOVSQ_64:
+  case X86::REP_MOVSW_32:
+  case X86::REP_MOVSW_64:
+  case X86::REP_PREFIX:
+  case X86::REP_STOSB_32:
+  case X86::REP_STOSB_64:
+  case X86::REP_STOSD_32:
+  case X86::REP_STOSD_64:
+  case X86::REP_STOSQ_32:
+  case X86::REP_STOSQ_64:
+  case X86::REP_STOSW_32:
+  case X86::REP_STOSW_64:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
 /// emitPrologue - Push callee-saved registers onto the stack, which
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
@@ -2194,13 +2222,44 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
   // in each prologue of interrupt handler function.
   //
-  // FIXME: Create "cld" instruction only in these cases:
+  // Create "cld" instruction only in these cases:
   // 1. The interrupt handling function uses any of the "rep" instructions.
   // 2. Interrupt handling function calls another function.
+  // 3. If there are any inline asm blocks, as we do not know what they do
   //
-  if (Fn.getCallingConv() == CallingConv::X86_INTR)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
-        .setMIFlag(MachineInstr::FrameSetup);
+  // TODO: We should also emit cld if we detect the use of std, but as of now,
+  // the compiler does not even emit that instruction or even define it, so in
+  // practice, this would only happen with inline asm, which we cover anyway.
+  if (Fn.getCallingConv() == CallingConv::X86_INTR) {
+    bool NeedsCLD = false;
+
+    for (const MachineBasicBlock &B : MF) {
+      for (const MachineInstr &MI : B) {
+        if (MI.isCall()) {
+          NeedsCLD = true;
+          break;
+        }
+
+        if (isOpcodeRep(MI.getOpcode())) {
+          NeedsCLD = true;
+          break;
+        }
+
+        if (MI.isInlineAsm()) {
+          // TODO: Parse asm for rep instructions or call sites?
+          // For now, let's play it safe and emit a cld instruction
+          // just in case.
+          NeedsCLD = true;
+          break;
+        }
+      }
+    }
+
+    if (NeedsCLD) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
 
   // At this point we know if the function has WinCFI or not.
   MF.setHasWinCFI(HasWinCFI);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0722c402348ee0..bec13d1c00ef7d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55789,6 +55789,15 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  auto IsExtractFree = [](SDValue V) {
+    V = peekThroughBitcasts(V);
+    if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+      return true;
+    if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
+      return true;
+    return V.isUndef();
+  };
+
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
   unsigned InOpcode = InVec.getOpcode();
@@ -55830,14 +55839,27 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
     }
     if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
-        (VT.is128BitVector() || VT.is256BitVector())) {
+        (SizeInBits == 128 || SizeInBits == 256)) {
       SDValue InVecSrc = InVec.getOperand(0);
       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
       return DAG.getNode(InOpcode, DL, VT, Ext);
     }
+    if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
+         InOpcode == X86ISD::PCMPGT) &&
+        (IsExtractFree(InVec.getOperand(0)) ||
+         IsExtractFree(InVec.getOperand(1))) &&
+        SizeInBits == 128) {
+      SDValue Ext0 =
+          extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+      SDValue Ext1 =
+          extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
+      if (InOpcode == X86ISD::CMPP)
+        return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
+      return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
+    }
     if (InOpcode == X86ISD::MOVDDUP &&
-        (VT.is128BitVector() || VT.is256BitVector())) {
+        (SizeInBits == 128 || SizeInBits == 256)) {
       SDValue Ext0 =
           extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
       return DAG.getNode(InOpcode, DL, VT, Ext0);
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8a2864a0787312..5d5c4ea57ed56c 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1957,6 +1957,10 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
     // On COFF, don't instrument non-ODR linkages.
     if (G->isInterposable())
       return false;
+    // If the global has AvailableExternally linkage, then it is not in this
+    // module, which means it does not need to be instrumented.
+    if (G->hasAvailableExternallyLinkage())
+      return false;
   }
 
   // If a comdat is present, it must have a selection kind that implies ODR
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 9b6a39e98f5ce8..7e48c28176bd1c 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -461,7 +461,7 @@ static Decomposition decomposeGEP(GEPOperator &GEP,
 
     // If Op0 is signed non-negative, the GEP is increasing monotonically and
     // can be de-composed.
-    if (!isKnownNonNegative(Index, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+    if (!isKnownNonNegative(Index, DL))
       Preconditions.emplace_back(CmpInst::ICMP_SGE, Index,
                                  ConstantInt::get(Index->getType(), 0));
   }
@@ -560,10 +560,10 @@ static Decomposition decompose(Value *V,
     return MergeResults(Op0, Op1, IsSigned);
   }
   if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) {
-    if (!isKnownNonNegative(Op0, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+    if (!isKnownNonNegative(Op0, DL))
       Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0,
                                  ConstantInt::get(Op0->getType(), 0));
-    if (!isKnownNonNegative(Op1, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1))
+    if (!isKnownNonNegative(Op1, DL))
       Preconditions.emplace_back(CmpInst::ICMP_SGE, Op1,
                                  ConstantInt::get(Op1->getType(), 0));
 
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 6c8785d52c4eab..c7b9ce2e93120a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -293,7 +293,7 @@ calculateFragment(DILocalVariable *Variable,
   if (!CurrentFragment) {
     if (auto Size = Variable->getSizeInBits()) {
       // Treat the current fragment as covering the whole variable.
-      CurrentFragment =  DIExpression::FragmentInfo(*Size, 0);
+      CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
       if (Target == CurrentFragment)
         return UseNoFrag;
     }
@@ -1213,8 +1213,9 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    insertUse(II, Offset, Length ? Length->getLimitedValue()
-                                 : AllocSize - Offset.getLimitedValue(),
+    insertUse(II, Offset,
+              Length ? Length->getLimitedValue()
+                     : AllocSize - Offset.getLimitedValue(),
               (bool)Length);
   }
 
@@ -1669,7 +1670,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
   }
 
   // Inject loads into all of the pred blocks.
-  DenseMap<BasicBlock*, Value*> InjectedLoads;
+  DenseMap<BasicBlock *, Value *> InjectedLoads;
   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
     BasicBlock *Pred = PN.getIncomingBlock(Idx);
     Value *InVal = PN.getIncomingValue(Idx);
@@ -1678,7 +1679,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
     // basic block, as long as the value is the same. So if we already injected
     // a load in the predecessor, then we should reuse the same load for all
     // duplicated entries.
-    if (Value* V = InjectedLoads.lookup(Pred)) {
+    if (Value *V = InjectedLoads.lookup(Pred)) {
       NewPN->addIncoming(V, Pred);
       continue;
     }
@@ -2077,8 +2078,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
   if (BeginIndex * ElementSize != BeginOffset ||
       BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
     return false;
-  uint64_t EndOffset =
-      std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+  uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
   uint64_t EndIndex = EndOffset / ElementSize;
   if (EndIndex * ElementSize != EndOffset ||
       EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
@@ -2754,8 +2754,8 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
     IRB.SetInsertPoint(OldUserI);
     IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
-    IRB.getInserter().SetNamePrefix(
-        Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+    IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
+                                    Twine(BeginOffset) + ".");
 
     CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
     if (VecTy || IntTy)
@@ -2808,7 +2808,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
 #else
                           Twine()
 #endif
-                          );
+    );
   }
 
   /// Compute suitable alignment to access this slice of the *new*
@@ -3189,8 +3189,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     const bool CanContinue = [&]() {
       if (VecTy || IntTy)
         return true;
-      if (BeginOffset > NewAllocaBeginOffset ||
-          EndOffset < NewAllocaEndOffset)
+      if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
         return false;
       // Length must be in range for FixedVectorType.
       auto *C = cast<ConstantInt>(II.getLength());
@@ -3957,11 +3956,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     return false;
   }
 
-  // Fold gep (select cond, ptr1, ptr2), idx
+  // Unfold gep (select cond, ptr1, ptr2), idx
   //   => select cond, gep(ptr1, idx), gep(ptr2, idx)
   // and  gep ptr, (select cond, idx1, idx2)
   //   => select cond, gep(ptr, idx1), gep(ptr, idx2)
-  bool foldGEPSelect(GetElementPtrInst &GEPI) {
+  bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
     // Check whether the GEP has exactly one select operand and all indices
     // will become constant after the transform.
     SelectInst *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
@@ -3984,9 +3983,9 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     if (!Sel)
       return false;
 
-    LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):"
-                      << "\n    original: " << *Sel
-                      << "\n              " << GEPI);
+    LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):\n";
+               dbgs() << "    original: " << *Sel << "\n";
+               dbgs() << "              " << GEPI << "\n";);
 
     auto GetNewOps = [&](Value *SelOp) {
       SmallVector<Value *> NewOps;
@@ -4023,74 +4022,107 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
     Visited.insert(NSelI);
     enqueueUsers(*NSelI);
 
-    LLVM_DEBUG(dbgs() << "\n          to: " << *NTrue
-                      << "\n              " << *NFalse
-                      << "\n              " << *NSel << '\n');
+    LLVM_DEBUG(dbgs() << "          to: " << *NTrue << "\n";
+               dbgs() << "              " << *NFalse << "\n";
+               dbgs() << "              " << *NSel << "\n";);
 
     return true;
   }
 
-  // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
-  bool foldGEPPhi(GetElementPtrInst &GEPI) {
-    if (!GEPI.hasAllConstantIndices())
-      return false;
+  // Unfold gep (phi ptr1, ptr2), idx
+  //   => phi ((gep ptr1, idx), (gep ptr2, idx))
+  // and  gep ptr, (phi idx1, idx2)
+  //   => phi ((gep ptr, idx1), (gep ptr, idx2))
+  bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
+    // To prevent infinitely expanding recursive phis, bail if the GEP pointer
+    // operand (looking through the phi if it is the phi we want to unfold) is
+    // an instruction besides an alloca.
+    PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
+    auto IsInvalidPointerOperand = [](Value *V) {
+      return isa<Instruction>(V) && !isa<AllocaInst>(V);
+    };
+    if (Phi) {
+      if (any_of(Phi->operands(), IsInvalidPointerOperand))
+        return false;
+    } else {
+      if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
+        return false;
+    }
+    // Check whether the GEP has exactly one phi operand (including the pointer
+    // operand) and all indices will become constant after the transform.
+    for (Value *Op : GEPI.indices()) {
+      if (auto *SI = dyn_cast<PHINode>(Op)) {
+        if (Phi)
+          return false;
+
+        Phi = SI;
+        if (!all_of(Phi->incoming_values(),
+                    [](Value *V) { return isa<ConstantInt>(V); }))
+          return false;
+        continue;
+      }
+
+      if (!isa<ConstantInt>(Op))
+        return false;
+    }
 
-    PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
-    if (GEPI.getParent() != PHI->getParent() ||
-        llvm::any_of(PHI->incoming_values(), [](Value *In)
-          { Instruction *I = dyn_cast<Instruction>(In);
-            return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
-                   succ_empty(I->getParent()) ||
-                   !I->getParent()->isLegalToHoistInto();
-          }))
+    if (!Phi)
       return false;
 
-    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):"
-                      << "\n    original: " << *PHI
-                      << "\n              " << GEPI
-                      << "\n          to: ");
+    LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):\n";
+               dbgs() << "    original: " << *Phi << "\n";
+               dbgs() << "              " << GEPI << "\n";);
 
-    SmallVector<Value *, 4> Index(GEPI.indices());
-    bool IsInBounds = GEPI.isInBounds();
-    IRB.SetInsertPoint(GEPI.getParent(), GEPI.getParent()->getFirstNonPHIIt());
-    PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
-                                   PHI->getName() + ".sroa.phi");
-    for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
-      BasicBlock *B = PHI->getIncomingBlock(I);
-      Value *NewVal = nullptr;
-      int Idx = NewPN->getBasicBlockIndex(B);
-      if (Idx >= 0) {
-        NewVal = NewPN->getIncomingValue(Idx);
-      } else {
-        Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+    auto GetNewOps = [&](Value *PhiOp) {
+      SmallVector<Value *> NewOps;
+      for (Value *Op : GEPI.operands())
+        if (Op == Phi)
+          NewOps.push_back(PhiOp);
+        else
+          NewOps.push_back(Op);
+      return NewOps;
+    };
 
-        IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
-        Type *Ty = GEPI.getSourceElementType();
-        NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep",
-                               IsInBounds);
-      }
-      NewPN->addIncoming(NewVal, B);
+    IRB.SetInsertPoint(Phi);
+    PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
+                                    Phi->getName() + ".sroa.phi");
+
+    bool IsInBounds = GEPI.isInBounds();
+    Type *SourceTy = GEPI.getSourceElementType();
+    // We only handle arguments, constants, and static allocas here, so we can
+    // insert GEPs at the beginning of the function after static allocas.
+    IRB.SetInsertPointPastAllocas(GEPI.getFunction());
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+      Value *Op = Phi->getIncomingValue(I);
+      BasicBlock *BB = Phi->getIncomingBlock(I);
+      SmallVector<Value *> NewOps = GetNewOps(Op);
+
+      Value *NewGEP =
+          IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
+                        Phi->getName() + ".sroa.gep", IsInBounds);
+      NewPhi->addIncoming(NewGEP, BB);
     }
 
     Visited.erase(&GEPI);
-    GEPI.replaceAllUsesWith(NewPN);
+    GEPI.replaceAllUsesWith(NewPhi);
     GEPI.eraseFromParent();
-    Visited.insert(NewPN);
-    enqueueUsers(*NewPN);
+    Visited.insert(NewPhi);
+    enqueueUsers(*NewPhi);
 
-    LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
-                 dbgs() << "\n              " << *In;
-               dbgs() << "\n              " << *NewPN << '\n');
+    LLVM_DEBUG(dbgs() << "          to: ";
+               for (Value *In
+                    : NewPhi->incoming_values()) dbgs()
+               << "\n              " << *In;
+               dbgs() << "\n              " << *NewPhi << '\n');
 
     return true;
   }
 
   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
-    if (foldGEPSelect(GEPI))
+    if (unfoldGEPSelect(GEPI))
       return true;
 
-    if (isa<PHINode>(GEPI.getPointerOperand()) &&
-        foldGEPPhi(GEPI))
+    if (unfoldGEPPhi(GEPI))
       return true;
 
     enqueueUsers(GEPI);
@@ -4162,17 +4194,17 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
     return nullptr;
 
   if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
-     Type *ElementTy;
-     uint64_t TyNumElements;
-     if (auto *AT = dyn_cast<ArrayType>(Ty)) {
-       ElementTy = AT->getElementType();
-       TyNumElements = AT->getNumElements();
-     } else {
-       // FIXME: This isn't right for vectors with non-byte-sized or
-       // non-power-of-two sized elements.
-       auto *VT = cast<FixedVectorType>(Ty);
-       ElementTy = VT->getElementType();
-       TyNumElements = VT->getNumElements();
+    Type *ElementTy;
+    uint64_t TyNumElements;
+    if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+      ElementTy = AT->getElementType();
+      TyNumElements = AT->getNumElements();
+    } else {
+      // FIXME: This isn't right for vectors with non-byte-sized or
+      // non-power-of-two sized elements.
+      auto *VT = cast<FixedVectorType>(Ty);
+      ElementTy = VT->getElementType();
+      TyNumElements = VT->getNumElements();
     }
     uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
     uint64_t NumSkippedElements = Offset / ElementSize;
@@ -4853,9 +4885,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     ++NumNewAllocas;
   }
 
-  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
-                    << "[" << P.beginOffset() << "," << P.endOffset()
-                    << ") to: " << *NewAI << "\n");
+  LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
+                    << "," << P.endOffset() << ") to: " << *NewAI << "\n");
 
   // Track the high watermark on the worklist as it is only relevant for
   // promoted allocas. We will reset it to this point if the alloca is not in
@@ -5040,8 +5071,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
         IsSorted = false;
       }
     }
-  }
-  else {
+  } else {
     // We only allow whole-alloca splittable loads and stores
     // for a large alloca to avoid creating too large BitVector.
     for (Slice &S : AS) {
@@ -5069,7 +5099,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
     uint64_t Offset;
     uint64_t Size;
     Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
-      : Alloca(AI), Offset(O), Size(S) {}
+        : Alloca(AI), Offset(O), Size(S) {}
   };
   SmallVector<Fragment, 4> Fragments;
 
@@ -5083,7 +5113,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
             DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
         // Don't include any padding.
         uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
-        Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+        Fragments.push_back(
+            Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
       }
     }
     ++NumPartitions;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5deac7975728f..b81cd508663c32 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -315,12 +315,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
-/// Interleave small loops with scalar reductions.
-static cl::opt<bool> InterleaveSmallLoopScalarReduction(
-    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
-    cl::desc("Enable interleaving for loops with small iteration counts that "
-             "contain scalar reductions to expose ILP."));
-
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -1510,19 +1504,36 @@ class LoopVectorizationCostModel {
   }
 
   /// Returns the TailFoldingStyle that is best for the current loop.
-  TailFoldingStyle
-  getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
-    if (!CanFoldTailByMasking)
-      return TailFoldingStyle::None;
+  TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+    return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first
+                               : ChosenTailFoldingStyle.second;
+  }
+
+  /// Selects and saves TailFoldingStyle for 2 options - if IV update may
+  /// overflow or not.
+  void setTailFoldingStyles() {
+    assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
+           ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
+           "Tail folding must not be selected yet.");
+    if (!Legal->prepareToFoldTailByMasking())
+      return;
 
-    if (ForceTailFoldingStyle.getNumOccurrences())
-      return ForceTailFoldingStyle;
+    if (ForceTailFoldingStyle.getNumOccurrences()) {
+      ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second =
+          ForceTailFoldingStyle;
+      return;
+    }
 
-    return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow);
+    ChosenTailFoldingStyle.first =
+        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true);
+    ChosenTailFoldingStyle.second =
+        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false);
   }
 
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const {
+    // TODO: check if it is possible to check for None style independent of
+    // IVUpdateMayOverflow flag in getTailFoldingStyle.
     return getTailFoldingStyle() != TailFoldingStyle::None;
   }
 
@@ -1675,8 +1686,10 @@ class LoopVectorizationCostModel {
   /// iterations to execute in the scalar loop.
   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
 
-  /// All blocks of loop are to be masked to fold tail of scalar iterations.
-  bool CanFoldTailByMasking = false;
+  /// Control finally chosen tail folding style. The first element is used if
+  /// the IV update may overflow, the second element - if it does not.
+  std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle =
+      std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
 
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
@@ -4633,10 +4646,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->prepareToFoldTailByMasking()) {
-    CanFoldTailByMasking = true;
+  setTailFoldingStyles();
+  if (foldTailByMasking())
     return MaxFactors;
-  }
 
   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
@@ -5477,8 +5489,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
 
     // If there are scalar reductions and TTI has enabled aggressive
     // interleaving for reductions, we will interleave to expose ILP.
-    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
-        AggressivelyInterleaveReductions) {
+    if (VF.isScalar() && AggressivelyInterleaveReductions) {
       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
       // Interleave no less than SmallIC but not as aggressive as the normal IC
       // to satisfy the rare situation when resources are too limited.
@@ -7450,12 +7461,14 @@ LoopVectorizationPlanner::executePlan(
       (IsEpilogueVectorization || !ExpandedSCEVs) &&
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
 
-  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
-                    << '\n');
-
   if (!IsEpilogueVectorization)
     VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
+  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
+                    << ", UF=" << BestUF << '\n');
+  BestVPlan.setName("Final VPlan");
+  LLVM_DEBUG(BestVPlan.dump());
+
   // Perform the actual loop transformation.
   VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
                          OrigLoop->getHeader()->getContext());
@@ -10073,6 +10086,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
           auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
               ExpandedSCEVs.find(ExpandR->getSCEV())->second);
           ExpandR->replaceAllUsesWith(ExpandedVal);
+          if (BestEpiPlan.getTripCount() == ExpandR)
+            BestEpiPlan.resetTripCount(ExpandedVal);
           ExpandR->eraseFromParent();
         }
 
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index de4e56ff80659a..2b7d518c1c1a78 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9364,7 +9364,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
                 VectorCasts
                     .insert(std::make_pair(ScalarTE, FTy->getElementType()))
                     .second) {
-              unsigned BWSz = It->second.second;
+              unsigned BWSz = It->second.first;
               unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
               unsigned VecOpcode;
               if (DstBWSz < BWSz)
@@ -9376,7 +9376,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
               InstructionCost C = TTI->getCastInstrCost(
                   VecOpcode, FTy,
                   FixedVectorType::get(
-                      IntegerType::get(FTy->getContext(), It->second.first),
+                      IntegerType::get(FTy->getContext(), BWSz),
                       FTy->getNumElements()),
                   TTI::CastContextHint::None, CostKind);
               LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 47cebecdb27083..16c09a83e777dd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2931,6 +2931,14 @@ class VPlan {
     return TripCount;
   }
 
+  /// Resets the trip count for the VPlan. The caller must make sure all uses of
+  /// the original trip count have been replaced.
+  void resetTripCount(VPValue *NewTripCount) {
+    assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
+           "TripCount always must be set");
+    TripCount = NewTripCount;
+  }
+
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
     if (!BackedgeTakenCount)
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
index 7244ac949ab88c..9a4e01a29ecb6d 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
@@ -14,12 +14,12 @@ define void @array_1D(ptr %addr) #0 {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
@@ -81,18 +81,18 @@ define void @array_2D(ptr %addr) #0 {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #5, mul vl]
-; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT:    st1d { z5.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT:    st1d { z4.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #5, mul vl]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT:    ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z5.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #5, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT:    st1d { z3.d }, p0, [sp, #3, mul vl]
+; CHECK-NEXT:    st1d { z5.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z4.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #6
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
index f03a6f018d34d0..7292d52aaf4765 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
@@ -13,12 +13,12 @@ define void @test(ptr %addr) #0 {
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
 ; CHECK-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
-; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT:    st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x0]
+; CHECK-NEXT:    st1d { z0.d }, p0, [sp, #2, mul vl]
+; CHECK-NEXT:    st1d { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    st1d { z2.d }, p0, [sp]
 ; CHECK-NEXT:    addvl sp, sp, #3
 ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll
new file mode 100644
index 00000000000000..7f4540d915ab37
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/load.ll
@@ -0,0 +1,318 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; ===== Legal Scalars =====
+
+define i8 @load_i8(ptr %ptr){
+; CHECK-LABEL: load_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrb w0, [x0]
+; CHECK-NEXT:    ret
+    %a = load i8 , ptr %ptr
+    ret i8 %a
+}
+
+define i16 @load_i16(ptr %ptr){
+; CHECK-LABEL: load_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldrh w0, [x0]
+; CHECK-NEXT:    ret
+    %a = load i16 , ptr %ptr
+    ret i16 %a
+}
+
+define i32 @load_i32(ptr %ptr){
+; CHECK-LABEL: load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:    ret
+    %a = load i32 , ptr %ptr
+    ret i32 %a
+}
+
+define i64 @load_i64(ptr %ptr){
+; CHECK-LABEL: load_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr x0, [x0]
+; CHECK-NEXT:    ret
+    %a = load i64 , ptr %ptr
+    ret i64 %a
+}
+
+; ===== Legal Vector Types =====
+
+define <8 x i8> @load_v8i8(ptr %ptr){
+; CHECK-LABEL: load_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <8 x i8>, ptr %ptr
+    ret <8 x i8> %a
+}
+
+define <16 x i8> @load_v16i8(ptr %ptr){
+; CHECK-LABEL: load_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <16 x i8>, ptr %ptr
+    ret <16 x i8> %a
+}
+
+define <4 x i16> @load_v4i16(ptr %ptr){
+; CHECK-LABEL: load_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <4 x i16>, ptr %ptr
+    ret <4 x i16> %a
+}
+
+define <8 x i16> @load_v8i16(ptr %ptr){
+; CHECK-LABEL: load_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <8 x i16>, ptr %ptr
+    ret <8 x i16> %a
+}
+
+define <2 x i32> @load_v2i32(ptr %ptr){
+; CHECK-LABEL: load_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <2 x i32>, ptr %ptr
+    ret <2 x i32> %a
+}
+
+define <4 x i32> @load_v4i32(ptr %ptr){
+; CHECK-LABEL: load_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <4 x i32>, ptr %ptr
+    ret <4 x i32> %a
+}
+
+define <2 x i64> @load_v2i64(ptr %ptr){
+; CHECK-LABEL: load_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <2 x i64>, ptr %ptr
+    ret <2 x i64> %a
+}
+
+; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
+
+define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){
+; CHECK-SD-LABEL: load_v2i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ld1 { v0.b }[0], [x0]
+; CHECK-SD-NEXT:    add x8, x0, #1
+; CHECK-SD-NEXT:    ld1 { v0.b }[4], [x8]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v2i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %a = load <2 x i8>, ptr %ptr
+    ret <2 x i8> %a
+}
+
+define i32 @load_v4i8(ptr %ptr, <4 x i8> %b){
+; CHECK-LABEL: load_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <4 x i8>, ptr %ptr
+    %c = bitcast <4 x i8> %a to i32
+    ret i32 %c
+}
+
+define <32 x i8> @load_v32i8(ptr %ptr){
+; CHECK-LABEL: load_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    %a = load <32 x i8>, ptr %ptr
+    ret <32 x i8> %a
+}
+
+define <2 x i16> @load_v2i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ld1 { v0.h }[0], [x0]
+; CHECK-SD-NEXT:    add x8, x0, #2
+; CHECK-SD-NEXT:    ld1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %a = load <2 x i16>, ptr %ptr
+    ret <2 x i16> %a
+}
+
+define <16 x i16> @load_v16i16(ptr %ptr){
+; CHECK-LABEL: load_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    %a = load <16 x i16>, ptr %ptr
+    ret <16 x i16> %a
+}
+
+define <1 x i32> @load_v1i32(ptr %ptr){
+; CHECK-LABEL: load_v1i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+    %a = load <1 x i32>, ptr %ptr
+    ret <1 x i32> %a
+}
+
+define <8 x i32> @load_v8i32(ptr %ptr){
+; CHECK-LABEL: load_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    %a = load <8 x i32>, ptr %ptr
+    ret <8 x i32> %a
+}
+
+define <4 x i64> @load_v4i64(ptr %ptr){
+; CHECK-LABEL: load_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    %a = load <4 x i64>, ptr %ptr
+    ret <4 x i64> %a
+}
+
+; ===== Vectors with Non-Pow 2 Widths =====
+
+define <3 x i8> @load_v3i8(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr s0, [x0]
+; CHECK-SD-NEXT:    umov w0, v0.b[0]
+; CHECK-SD-NEXT:    umov w1, v0.b[1]
+; CHECK-SD-NEXT:    umov w2, v0.b[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v3i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldrb w8, [x0]
+; CHECK-GI-NEXT:    ldrb w1, [x0, #1]
+; CHECK-GI-NEXT:    ldrb w2, [x0, #2]
+; CHECK-GI-NEXT:    mov w0, w8
+; CHECK-GI-NEXT:    ret
+    %a = load <3 x i8>, ptr %ptr
+    ret <3 x i8> %a
+}
+
+define <7 x i8> @load_v7i8(ptr %ptr){
+; CHECK-SD-LABEL: load_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr b0, [x0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #1]
+; CHECK-GI-NEXT:    mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.b[2], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #3]
+; CHECK-GI-NEXT:    mov v0.b[3], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.b[4], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #5]
+; CHECK-GI-NEXT:    mov v0.b[5], v1.b[0]
+; CHECK-GI-NEXT:    ldr b1, [x0, #6]
+; CHECK-GI-NEXT:    mov v0.b[6], v1.b[0]
+; CHECK-GI-NEXT:    mov v0.b[7], v0.b[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %a = load <7 x i8>, ptr %ptr
+    ret <7 x i8> %a
+}
+
+define <3 x i16> @load_v3i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr d0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+    %a = load <3 x i16>, ptr %ptr
+    ret <3 x i16> %a
+}
+
+define <7 x i16> @load_v7i16(ptr %ptr){
+; CHECK-SD-LABEL: load_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldr h0, [x0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #4]
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #6]
+; CHECK-GI-NEXT:    mov v0.h[3], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #8]
+; CHECK-GI-NEXT:    mov v0.h[4], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #10]
+; CHECK-GI-NEXT:    mov v0.h[5], v1.h[0]
+; CHECK-GI-NEXT:    ldr h1, [x0, #12]
+; CHECK-GI-NEXT:    mov v0.h[6], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[7], v0.h[0]
+; CHECK-GI-NEXT:    ret
+    %a = load <7 x i16>, ptr %ptr
+    ret <7 x i16> %a
+}
+
+define <3 x i32> @load_v3i32(ptr %ptr){
+; CHECK-SD-LABEL: load_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    ldr q0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: load_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    ldp s0, s1, [x0]
+; CHECK-GI-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-GI-NEXT:    ldr s1, [x0, #8]
+; CHECK-GI-NEXT:    mov v0.s[2], v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[3], v0.s[0]
+; CHECK-GI-NEXT:    ret
+    %a = load <3 x i32>, ptr %ptr
+    ret <3 x i32> %a
+}
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 2a78012045ff42..cd348be5d771d1 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
 
 
@@ -447,3 +447,64 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
   %res = frem float %a, %b
   ret float %res
 }
+
+;
+; Check ZT0 State
+;
+
+declare double @zt0_shared_callee(double) "aarch64_inout_zt0"
+
+define double  @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" {
+; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee:
+; CHECK-COMMON:       // %bb.0: // %prelude
+; CHECK-COMMON-NEXT:    sub sp, sp, #80
+; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    cbz x8, .LBB13_2
+; CHECK-COMMON-NEXT:    b .LBB13_1
+; CHECK-COMMON-NEXT:  .LBB13_1: // %save.za
+; CHECK-COMMON-NEXT:    mov x8, sp
+; CHECK-COMMON-NEXT:    str zt0, [x8]
+; CHECK-COMMON-NEXT:    bl __arm_tpidr2_save
+; CHECK-COMMON-NEXT:    ldr zt0, [x8]
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT:    b .LBB13_2
+; CHECK-COMMON-NEXT:  .LBB13_2: // %entry
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    zero { zt0 }
+; CHECK-COMMON-NEXT:    bl zt0_shared_callee
+; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-COMMON-NEXT:    fmov d1, x8
+; CHECK-COMMON-NEXT:    fadd d0, d0, d1
+; CHECK-COMMON-NEXT:    smstop za
+; CHECK-COMMON-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    add sp, sp, #80
+; CHECK-COMMON-NEXT:    ret
+entry:
+  %call = call double @zt0_shared_callee(double %x)
+  %add = fadd double %call, 4.200000e+01
+  ret double %add;
+}
+
+define double  @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline optnone "aarch64_inout_zt0" {
+; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee:
+; CHECK-COMMON:       // %bb.0: // %entry
+; CHECK-COMMON-NEXT:    sub sp, sp, #80
+; CHECK-COMMON-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x19, sp
+; CHECK-COMMON-NEXT:    str zt0, [x19]
+; CHECK-COMMON-NEXT:    smstop za
+; CHECK-COMMON-NEXT:    bl normal_callee
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    ldr zt0, [x19]
+; CHECK-COMMON-NEXT:    mov x8, #4631107791820423168 // =0x4045000000000000
+; CHECK-COMMON-NEXT:    fmov d1, x8
+; CHECK-COMMON-NEXT:    fadd d0, d0, d1
+; CHECK-COMMON-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    add sp, sp, #80
+; CHECK-COMMON-NEXT:    ret
+entry:
+  %call = call double @normal_callee(double %x)
+  %add = fadd double %call, 4.200000e+01
+  ret double %add;
+}
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
new file mode 100644
index 00000000000000..bf22d79a4df994
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -0,0 +1,342 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; ===== Legal Scalars =====
+define void @store_i8(i8 %a, ptr %ptr){
+; CHECK-LABEL: store_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    strb w0, [x1]
+; CHECK-NEXT:    ret
+    store i8 %a, ptr %ptr
+    ret void
+}
+
+define void @store_i16(i16 %a, ptr %ptr){
+; CHECK-LABEL: store_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    strh w0, [x1]
+; CHECK-NEXT:    ret
+    store i16 %a, ptr %ptr
+    ret void
+}
+
+define void @store_i32(i32 %a, ptr %ptr){
+; CHECK-LABEL: store_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str w0, [x1]
+; CHECK-NEXT:    ret
+    store i32 %a, ptr %ptr
+    ret void
+}
+
+define void @store_i64(i64 %a, ptr %ptr){
+; CHECK-LABEL: store_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ret
+    store i64 %a, ptr %ptr
+    ret void
+}
+
+; ===== Legal Vector Types =====
+
+define void @store_v8i8(<8 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+    store <8 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v16i8(<16 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+    store <16 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v4i16(<4 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+    store <4 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v8i16(<8 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+    store <8 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v2i32(<2 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str d0, [x0]
+; CHECK-NEXT:    ret
+    store <2 x i32> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v4i32(<4 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+    store <4 x i32> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v2i64(<2 x i64> %a, ptr %ptr){
+; CHECK-LABEL: store_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str q0, [x0]
+; CHECK-NEXT:    ret
+    store <2 x i64> %a, ptr %ptr
+    ret void
+}
+
+; ===== Smaller/Larger Width Vectors with Legal Element Sizes =====
+
+define void @store_v2i8(<2 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v2i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strb w9, [x0]
+; CHECK-SD-NEXT:    strb w8, [x0, #1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v2i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    str b0, [x0]
+; CHECK-GI-NEXT:    str b1, [x0, #1]
+; CHECK-GI-NEXT:    ret
+    store <2 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v4i8(i32 %a, ptr %ptr) {
+; CHECK-LABEL: store_v4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str w0, [x1]
+; CHECK-NEXT:    ret
+    %c = bitcast i32 %a to <4 x i8>
+    store <4 x i8> %c, ptr %ptr
+    ret void
+}
+
+define void @store_v32i8(<32 x i8> %a, ptr %ptr){
+; CHECK-LABEL: store_v32i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    store <32 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v2i16(<2 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v2i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    mov w8, v0.s[1]
+; CHECK-SD-NEXT:    fmov w9, s0
+; CHECK-SD-NEXT:    strh w9, [x0]
+; CHECK-SD-NEXT:    strh w8, [x0, #2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v2i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    str h1, [x0, #2]
+; CHECK-GI-NEXT:    ret
+    store <2 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v16i16(<16 x i16> %a, ptr %ptr){
+; CHECK-LABEL: store_v16i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    store <16 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v1i32(<1 x i32> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
+    store <1 x i32> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v8i32(<8 x i32> %a, ptr %ptr){
+; CHECK-LABEL: store_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    store <8 x i32> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v4i64(<4 x i64> %a, ptr %ptr){
+; CHECK-LABEL: store_v4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp q0, q1, [x0]
+; CHECK-NEXT:    ret
+    store <4 x i64> %a, ptr %ptr
+    ret void
+}
+
+; ===== Vectors with Non-Pow 2 Widths =====
+
+define void @store_v3i8(<3 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    sub sp, sp, #16
+; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.h[1], w1
+; CHECK-SD-NEXT:    mov v0.h[2], w2
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
+; CHECK-SD-NEXT:    str s0, [sp, #12]
+; CHECK-SD-NEXT:    ldrh w8, [sp, #12]
+; CHECK-SD-NEXT:    strb w2, [x3, #2]
+; CHECK-SD-NEXT:    strh w8, [x3]
+; CHECK-SD-NEXT:    add sp, sp, #16
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v3i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    strb w0, [x3]
+; CHECK-GI-NEXT:    strb w1, [x3, #1]
+; CHECK-GI-NEXT:    strb w2, [x3, #2]
+; CHECK-GI-NEXT:    ret
+    store <3 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v7i8(<7 x i8> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v7i8:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add x8, x0, #6
+; CHECK-SD-NEXT:    add x9, x0, #4
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    st1 { v0.b }[6], [x8]
+; CHECK-SD-NEXT:    st1 { v0.h }[2], [x9]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v7i8:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #1
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    add x9, x0, #2
+; CHECK-GI-NEXT:    st1 { v0.b }[0], [x0]
+; CHECK-GI-NEXT:    st1 { v0.b }[1], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #3
+; CHECK-GI-NEXT:    st1 { v0.b }[3], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #4
+; CHECK-GI-NEXT:    st1 { v0.b }[4], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #5
+; CHECK-GI-NEXT:    st1 { v0.b }[5], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #6
+; CHECK-GI-NEXT:    st1 { v0.b }[2], [x9]
+; CHECK-GI-NEXT:    st1 { v0.b }[6], [x8]
+; CHECK-GI-NEXT:    ret
+    store <7 x i8> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v3i16(<3 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add x8, x0, #4
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    st1 { v0.h }[2], [x8]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v3i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x0, #4
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    ret
+    store <3 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v7i16(<7 x i16> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v7i16:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add x8, x0, #12
+; CHECK-SD-NEXT:    add x9, x0, #8
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    st1 { v0.h }[6], [x8]
+; CHECK-SD-NEXT:    st1 { v0.s }[2], [x9]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v7i16:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #2
+; CHECK-GI-NEXT:    add x9, x0, #4
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.h }[1], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #6
+; CHECK-GI-NEXT:    st1 { v0.h }[3], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #8
+; CHECK-GI-NEXT:    st1 { v0.h }[4], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #10
+; CHECK-GI-NEXT:    st1 { v0.h }[5], [x8]
+; CHECK-GI-NEXT:    add x8, x0, #12
+; CHECK-GI-NEXT:    st1 { v0.h }[2], [x9]
+; CHECK-GI-NEXT:    st1 { v0.h }[6], [x8]
+; CHECK-GI-NEXT:    ret
+    store <7 x i16> %a, ptr %ptr
+    ret void
+}
+
+define void @store_v3i32(<3 x i32> %a, ptr %ptr){
+; CHECK-SD-LABEL: store_v3i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    add x8, x0, #8
+; CHECK-SD-NEXT:    str d0, [x0]
+; CHECK-SD-NEXT:    st1 { v0.s }[2], [x8]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v3i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    add x8, x0, #4
+; CHECK-GI-NEXT:    add x9, x0, #8
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    st1 { v0.s }[1], [x8]
+; CHECK-GI-NEXT:    st1 { v0.s }[2], [x9]
+; CHECK-GI-NEXT:    ret
+    store <3 x i32> %a, ptr %ptr
+    ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
index 82bf756f822898..c7c102f5d567d9 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
 
@@ -150,6 +150,46 @@ define <vscale x 16 x i1> @chained_reinterpret() {
   ret <vscale x 16 x i1> %out
 }
 
+define <vscale x 16 x i1> @reinterpret_scalar_bool_h(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 8 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 8 x i1> %.splatinsert, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_s(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 4 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 4 x i1> %.splatinsert, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+define <vscale x 16 x i1> @reinterpret_scalar_bool_q(i1 %x){
+; CHECK-LABEL: reinterpret_scalar_bool_q:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    ret
+  %.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpgt.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 921bdb5015c79a..63e7339d829e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -256,6 +256,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -607,6 +608,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -921,6 +923,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -1089,6 +1092,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -1242,6 +1246,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1306,6 +1311,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1389,6 +1395,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_mov_b32 s0, 1
 ; GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
 ; GFX12-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -1478,6 +1485,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_mov_b32 s0, 1
 ; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 7c1c24f4a67dfa..dfadd8d205b04e 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -525,6 +525,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
 ; GFX12-NEXT:    v_max_num_f32_e32 v1, 0, v1
 ; GFX12-NEXT:    v_min_num_f32_e32 v2, 1.0, v1
 ; GFX12-NEXT:    global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 687d8456569229..850be72f06c7d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -893,6 +893,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -964,6 +965,7 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -2137,6 +2139,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -2221,6 +2224,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -3382,6 +3386,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3468,6 +3473,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3714,6 +3720,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3789,6 +3796,7 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3998,6 +4006,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4055,6 +4064,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4107,6 +4117,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4164,6 +4175,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4220,6 +4232,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-NEXT:    v_mov_b32_e32 v3, 3
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4282,6 +4295,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, 3
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4340,6 +4354,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4405,6 +4420,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
 ; GFX12-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4456,6 +4472,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4523,6 +4540,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4576,6 +4594,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
@@ -4644,6 +4663,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
 ; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
 ; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
 ; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
new file mode 100644
index 00000000000000..1113acb3c0305a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll
@@ -0,0 +1,15 @@
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s
+
+; Test should not result in build failure
+; CHECK-LABEL: shouldNotReApply
+
+define amdgpu_kernel void @shouldNotReApply() {
+entry:
+  tail call void @llvm.amdgcn.sched.barrier(i32 0)
+  store <4 x i32> zeroinitializer, ptr addrspace(3) null, align 2147483648
+  tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0)
+  tail call void @llvm.amdgcn.sched.barrier(i32 0)
+  store i32 0, ptr addrspace(5) null, align 2147483648
+  tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
index afa914c8375f64..9547f08d3eba6b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll
@@ -269,6 +269,7 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
 ; GFX12-NEXT:    v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
 ; GFX12-NEXT:    s_lshr_b32 s1, ttmp7, 16
 ; GFX12-NEXT:    v_mov_b32_e32 v8, s1
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[2:3], v7, off scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index fa1ca66ef4156c..769d035858ca83 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -651,8 +651,8 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
 ; GFX12-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_mul_f32_e32 v1, 4.0, v0
-; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    global_store_b32 v[0:1], v1, off
+; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
 ; GFX12-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
index f93456ccacb806..4c61e6803febf3 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir
@@ -160,3 +160,60 @@ body:             |
     S_ENDPGM 0
 
 ...
+---
+name:            add_f16_u32_preserve_different_bb
+tracksRegLiveness: true
+body:             |
+  ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb
+  ; SDWA: bb.0:
+  ; SDWA-NEXT:   successors: %bb.1(0x80000000)
+  ; SDWA-NEXT:   liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT:   [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31
+  ; SDWA-NEXT:   [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+  ; SDWA-NEXT:   [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+  ; SDWA-NEXT:   [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+  ; SDWA-NEXT:   [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+  ; SDWA-NEXT:   [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[FLAT_LOAD_DWORD]], implicit $exec
+  ; SDWA-NEXT:   [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec
+  ; SDWA-NEXT:   [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec
+  ; SDWA-NEXT:   [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT: bb.1:
+  ; SDWA-NEXT:   successors: %bb.2(0x80000000)
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT:   [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec
+  ; SDWA-NEXT: {{  $}}
+  ; SDWA-NEXT: bb.2:
+  ; SDWA-NEXT:   [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0)
+  ; SDWA-NEXT:   FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+  ; SDWA-NEXT:   $sgpr30_sgpr31 = COPY [[COPY]]
+  ; SDWA-NEXT:   S_SETPC_B64_return $sgpr30_sgpr31
+  bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+    %2:sreg_64 = COPY $sgpr30_sgpr31
+    %1:vreg_64 = COPY $vgpr2_vgpr3
+    %0:vreg_64 = COPY $vgpr0_vgpr1
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+    %4:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32))
+
+    %5:vgpr_32 = V_AND_B32_e32 65535, %3, implicit $exec
+    %6:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec
+    %7:vgpr_32 = V_BFE_U32_e64 %3, 8, 8, implicit $exec
+    %8:vgpr_32 = V_LSHRREV_B32_e32 24, %4, implicit $exec
+
+    %9:vgpr_32 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec
+    %10:vgpr_32 = V_LSHLREV_B16_e64 8, %9, implicit $exec
+
+  bb.1:
+    %11:vgpr_32 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec
+    %12:vgpr_32 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+
+  bb.2:
+    %13:vgpr_32 = V_OR_B32_e64 %10, %12, implicit $exec
+
+    FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32))
+    $sgpr30_sgpr31 = COPY %2
+    S_SETPC_B64_return $sgpr30_sgpr31
+...
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
index 137bd0f5d9f14e..4efa1e9353ab3a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll
@@ -28,6 +28,7 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
 ; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v5, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -116,6 +117,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
 ; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -174,6 +176,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
 ; CHECK-NEXT:    scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v10, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
@@ -208,6 +211,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
 ; CHECK-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; CHECK-NEXT:    scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload
 ; CHECK-NEXT:    s_wait_loadcnt 0x0
+; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
 ; CHECK-NEXT:    s_wait_storecnt 0x0
 ; CHECK-NEXT:    scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
new file mode 100644
index 00000000000000..e6fbe97f8dc0a5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
+; GFX12-LABEL: intrinsic_store_system_scope:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
+  ret void
+}
+
+define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
+; GFX12-LABEL: generic_store_volatile:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b32 v[1:2], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
+  store volatile i32 %val, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
new file mode 100644
index 00000000000000..acf8bd3a6ab563
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir
@@ -0,0 +1,43 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer  %s -o - | FileCheck -check-prefix=GFX12 %s
+
+---
+name: intrinsic_store_system_scope
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX12-LABEL: name: intrinsic_store_system_scope
+    ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+    ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+    ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+    ; GFX12-NEXT: S_ENDPGM 0
+    BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
+    S_ENDPGM 0
+...
+
+---
+name: generic_store_volatile
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX12-LABEL: name: generic_store_volatile
+    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX12-NEXT: {{  $}}
+    ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
+    ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_KMCNT_soft 0
+    ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+    ; GFX12-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1)
+    ; GFX12-NEXT: S_WAIT_STORECNT_soft 0
+    ; GFX12-NEXT: S_ENDPGM 0
+    GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
index 2bc4288884f192..d006aa9ba38e84 100644
--- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
+++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir
@@ -89,7 +89,7 @@ body:             |
 # CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11
 # CHECK-NEXT:  $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg
 # CHECK-NEXT:  $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg
-# CHECK-NEXT:  VLSTM $sp, 14 /* CC::al */, $noreg, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $q0, implicit undef $q1, implicit undef $q2, implicit undef $q3, implicit undef $q4, implicit undef $q5, implicit undef $q6, implicit undef $q7
+# CHECK-NEXT:  VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
 # CHECK-NEXT:  $r1 = tMOVr $r0, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $r2 = tMOVr $r0, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
@@ -105,7 +105,7 @@ body:             |
 # CHECK-NEXT:  t2MSR_M 3072, $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
 # CHECK-NEXT:  tBLXNSr 14 /* CC::al */, $noreg, killed $r0, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $s0
 # CHECK-NEXT:  $r12 = VMOVRS $s0, 14 /* CC::al */, $noreg
-# CHECK-NEXT:  VLLDM $sp, 14 /* CC::al */, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+# CHECK-NEXT:  VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
 # CHECK-NEXT:  $s0 = VMOVSR $r12, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $sp = tADDspi $sp, 34, 14 /* CC::al */, $noreg
 # CHECK-NEXT:  $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
diff --git a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
index 8c49a531674115..ad53addcc21a35 100644
--- a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
+++ b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir
@@ -2,7 +2,7 @@
 --- |
   target triple = "thumbv8m.main-arm-none-eabi"
 
-  define hidden void @foo(ptr nocapture %baz) local_unnamed_addr #0 {
+  define hidden void @foo(void ()* nocapture %baz) local_unnamed_addr #0 {
   entry:
     %call = call i32 @bar() #0
     %tobool = icmp eq i32 %call, 0
@@ -55,14 +55,13 @@ body:             |
     tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $r0
 
   bb.2.land.end:
-    liveins: $r4
-
+    liveins: $r4, $vpr, $fpscr, $fpscr_nzcv, $d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7, $d8, $d9, $d10, $d11, $d12, $d13, $d14, $d15
     $sp = t2STMDB_UPD $sp, 14, $noreg, $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11
     $r4 = t2BICri $r4, 1, 14, $noreg, $noreg
     $sp = tSUBspi $sp, 34, 14, $noreg
-    VLSTM $sp, 14, $noreg
+    VLSTM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit $vpr, implicit $fpscr, implicit $fpscr_nzcv, implicit $d0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15
     tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp
-    VLLDM $sp, 14, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv
+    VLLDM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15
     $sp = tADDspi $sp, 34, 14, $noreg
     $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11
     $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $pc
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
index 1fe91c721f4dd2..1d025a2f776f82 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll
@@ -18,15 +18,15 @@ define void @test(ptr %addr) {
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
 ; CHECK-NEXT:    slli a2, a1, 1
-; CHECK-NEXT:    vl1re64.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vl1re64.v v9, (a3)
 ; CHECK-NEXT:    vl1re64.v v10, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v9, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs1r.v v10, (a2)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    vs1r.v v9, (a2)
+; CHECK-NEXT:    add a1, a0, a1
+; CHECK-NEXT:    vs1r.v v8, (a1)
+; CHECK-NEXT:    vs1r.v v10, (a0)
 ; CHECK-NEXT:    csrrs a0, vlenb, zero
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
index a9a680d54d5897..64031f8a93598f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll
@@ -16,13 +16,13 @@ define <vscale x 1 x double> @test(ptr %addr, i64 %vl) {
 ; CHECK-NEXT:    sub sp, sp, a2
 ; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrrs a2, vlenb, zero
-; CHECK-NEXT:    vl1re64.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a2
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vl1re64.v v8, (a3)
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs1r.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a2
-; CHECK-NEXT:    vs1r.v v9, (a2)
+; CHECK-NEXT:    vs1r.v v8, (a2)
+; CHECK-NEXT:    vs1r.v v9, (a0)
 ; CHECK-NEXT:    vl1re64.v v8, (a2)
 ; CHECK-NEXT:    vl1re64.v v9, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index a2d02b6bb641b2..76aa2b913c6525 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -474,7 +474,7 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v12, v9, a0
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v12, v10, a0
 ; CHECK-NEXT:    vmv2r.v v8, v12
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
index c49b1a7ad1861d..b9c611bf3e54a8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-KNOWNVLEN128
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
 define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) {
 ; CHECK-LABEL: extract_v2i8_v4i8_0:
@@ -63,22 +69,22 @@ define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) {
 }
 
 define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v1i32_v8i32_4:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v1i32_v8i32_4:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 4
+; VLA-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_4:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v9, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v1i32_v8i32_4:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v9, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4)
   store <1 x i32> %c, ptr %y
@@ -86,24 +92,24 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) {
 }
 
 define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v1i32_v8i32_5:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 5
-; CHECK-V-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v1i32_v8i32_5:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 1, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 5
+; VLA-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_5:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v9, 1
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v1i32_v8i32_5:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v9, 1
+; VLS-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 5)
   store <1 x i32> %c, ptr %y
@@ -111,20 +117,20 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_0:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_v8i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_0:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_v8i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 0)
   store <2 x i32> %c, ptr %y
@@ -132,24 +138,24 @@ define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_2:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_v8i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 2
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_2:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_v8i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v8, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2)
   store <2 x i32> %c, ptr %y
@@ -157,22 +163,22 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_4:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_v8i32_4:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 4
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_4:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v9, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_v8i32_4:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v9, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4)
   store <2 x i32> %c, ptr %y
@@ -180,24 +186,24 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_v8i32_6:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vle32.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_v8i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 6
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_6:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vl2re32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v9, 2
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_v8i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v9, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <8 x i32>, ptr %x
   %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6)
   store <2 x i32> %c, ptr %y
@@ -230,59 +236,59 @@ define void @extract_v2i32_nxv16i32_2(<vscale x 16 x i32> %x, ptr %y) {
 }
 
 define void @extract_v2i32_nxv16i32_4(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_4:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 4
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_nxv16i32_4:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 4
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_4:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v9, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_nxv16i32_4:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v9, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 4)
   store <2 x i32> %c, ptr %y
   ret void
 }
 
 define void @extract_v2i32_nxv16i32_6(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_6:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_nxv16i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 6
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_6:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v9, 2
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_nxv16i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v9, 2
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v8, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 6)
   store <2 x i32> %c, ptr %y
   ret void
 }
 
 define void @extract_v2i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i32_nxv16i32_8:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, m4, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 8
-; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i32_nxv16i32_8:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, m4, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 8
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_8:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vse32.v v10, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i32_nxv16i32_8:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vse32.v v10, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
   store <2 x i32> %c, ptr %y
   ret void
@@ -339,40 +345,40 @@ define void @extract_v2i8_nxv2i8_6(<vscale x 2 x i8> %x, ptr %y) {
 }
 
 define void @extract_v8i32_nxv16i32_8(<vscale x 16 x i32> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i32_nxv16i32_8:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 8
-; CHECK-V-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-V-NEXT:    vse32.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v8i32_nxv16i32_8:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 8
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i32_nxv16i32_8:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vs2r.v v10, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v8i32_nxv16i32_8:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    ret
   %c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32(<vscale x 16 x i32> %x, i64 8)
   store <8 x i32> %c, ptr %y
   ret void
 }
 
 define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_0:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v8i1_v64i1_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_0:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v8i1_v64i1_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 0)
   store <8 x i1> %c, ptr %y
@@ -380,26 +386,26 @@ define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) {
 }
 
 define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_8:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v8i1_v64i1_8:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 1
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_8:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 1
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v8i1_v64i1_8:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v8, 1
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 8)
   store <8 x i1> %c, ptr %y
@@ -407,26 +413,26 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) {
 }
 
 define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v8i1_v64i1_48:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v8, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 6
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v8i1_v64i1_48:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 6
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_48:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 6
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v8i1_v64i1_48:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 1, e8, mf2, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v8, 6
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 48)
   store <8 x i1> %c, ptr %y
@@ -508,38 +514,38 @@ define void @extract_v8i1_nxv64i1_192(<vscale x 64 x i1> %x, ptr %y) {
 }
 
 define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_0:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v0, (a0)
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_v64i1_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v0, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_0:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_v64i1_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v0, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 0)
   store <2 x i1> %c, ptr %y
@@ -547,48 +553,48 @@ define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_2:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v0, (a0)
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_v64i1_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v0, (a0)
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 2
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v8, 0
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_2:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_v64i1_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v0, (a0)
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v8, 2
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v8, 0
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 2)
   store <2 x i1> %c, ptr %y
@@ -596,49 +602,49 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) {
 }
 
 define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_v64i1_42:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    li a2, 64
-; CHECK-V-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vlm.v v0, (a0)
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    li a0, 42
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a1)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_v64i1_42:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 64
+; VLA-NEXT:    vsetvli zero, a2, e8, m4, ta, ma
+; VLA-NEXT:    vlm.v v0, (a0)
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    li a0, 42
+; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
+; VLA-NEXT:    vslidedown.vx v8, v8, a0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v8, 0
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a1)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_42:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vlm.v v0, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v10, 10
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a1)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_v64i1_42:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m4, ta, ma
+; VLS-NEXT:    vlm.v v0, (a0)
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v10, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v8, 0
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a1)
+; VLS-NEXT:    ret
   %a = load <64 x i1>, ptr %x
   %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 42)
   store <2 x i1> %c, ptr %y
@@ -665,45 +671,45 @@ define void @extract_v2i1_nxv2i1_0(<vscale x 2 x i1> %x, ptr %y) {
 }
 
 define void @extract_v2i1_nxv2i1_2(<vscale x 2 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv2i1_2:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_nxv2i1_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetvli a1, zero, e8, mf4, ta, ma
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 2
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v8, 0
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv2i1_2:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v8, 2
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_nxv2i1_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v8, 2
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v8, 0
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1(<vscale x 2 x i1> %x, i64 2)
   store <2 x i1> %c, ptr %y
   ret void
@@ -754,91 +760,91 @@ define void @extract_v2i1_nxv64i1_2(<vscale x 64 x i1> %x, ptr %y) {
 }
 
 define void @extract_v2i1_nxv64i1_42(<vscale x 64 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv64i1_42:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    li a1, 42
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
-; CHECK-V-NEXT:    vslidedown.vx v8, v8, a1
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_nxv64i1_42:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    li a1, 42
+; VLA-NEXT:    vsetivli zero, 2, e8, m4, ta, ma
+; VLA-NEXT:    vslidedown.vx v8, v8, a1
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v8, 0
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv64i1_42:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v10, 10
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_nxv64i1_42:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a1, zero, e8, m8, ta, ma
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v10, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v8, 0
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv64i1(<vscale x 64 x i1> %x, i64 42)
   store <2 x i1> %c, ptr %y
   ret void
 }
 
 define void @extract_v2i1_nxv32i1_26(<vscale x 32 x i1> %x, ptr %y) {
-; CHECK-V-LABEL: extract_v2i1_nxv32i1_26:
-; CHECK-V:       # %bb.0:
-; CHECK-V-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
-; CHECK-V-NEXT:    vslidedown.vi v8, v8, 26
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-V-NEXT:    vmv.v.i v8, 0
-; CHECK-V-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmv.v.i v9, 0
-; CHECK-V-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-V-NEXT:    vmv.v.v v9, v8
-; CHECK-V-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-V-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-V-NEXT:    vsm.v v8, (a0)
-; CHECK-V-NEXT:    ret
+; VLA-LABEL: extract_v2i1_nxv32i1_26:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 2, e8, m2, ta, ma
+; VLA-NEXT:    vslidedown.vi v8, v8, 26
+; VLA-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v8, 0
+; VLA-NEXT:    vmv.v.i v8, 0
+; VLA-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vmsne.vi v8, v9, 0
+; VLA-NEXT:    vsm.v v8, (a0)
+; VLA-NEXT:    ret
 ;
-; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv32i1_26:
-; CHECK-KNOWNVLEN128:       # %bb.0:
-; CHECK-KNOWNVLEN128-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vslidedown.vi v8, v9, 10
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v0, v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v8, 0
-; CHECK-KNOWNVLEN128-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.i v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmv.v.v v9, v8
-; CHECK-KNOWNVLEN128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-KNOWNVLEN128-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-KNOWNVLEN128-NEXT:    vsm.v v8, (a0)
-; CHECK-KNOWNVLEN128-NEXT:    ret
+; VLS-LABEL: extract_v2i1_nxv32i1_26:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; VLS-NEXT:    vslidedown.vi v8, v9, 10
+; VLS-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v8, 0
+; VLS-NEXT:    vmv.v.i v8, 0
+; VLS-NEXT:    vmerge.vim v8, v8, 1, v0
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vmsne.vi v8, v9, 0
+; VLS-NEXT:    vsm.v v8, (a0)
+; VLS-NEXT:    ret
   %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv32i1(<vscale x 32 x i1> %x, i64 26)
   store <2 x i1> %c, ptr %y
   ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index efb1f720f2d096..9f0240c53b219a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -1,9 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
 
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; CHECK-LABEL: insert_nxv8i32_v2i32_0:
@@ -45,26 +48,40 @@ define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v8i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v8i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v12
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v8i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v12
+; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
   ret <vscale x 8 x i32> %v
 }
 
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %svp) {
-; CHECK-LABEL: insert_nxv8i32_v8i32_8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v12, 8
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv8i32_v8i32_8:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v12, (a0)
+; VLA-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v12, 8
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv8i32_v8i32_8:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl2re32.v v12, (a0)
+; VLS-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v12, 8
+; VLS-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
   ret <vscale x 8 x i32> %v
@@ -82,17 +99,27 @@ define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
 }
 
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v4i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vle32.v v9, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vse32.v v9, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v4i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl1re32.v v9, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
+; VLS-NEXT:    vmv.v.v v9, v8
+; VLS-NEXT:    vs1r.v v9, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <4 x i32>, ptr %vp
   %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
@@ -101,15 +128,25 @@ define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vslideup.vi v9, v8, 2
-; CHECK-NEXT:    vse32.v v9, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v4i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vle32.v v9, (a0)
+; VLA-NEXT:    vslideup.vi v9, v8, 2
+; VLA-NEXT:    vse32.v v9, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v4i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl1re32.v v9, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLS-NEXT:    vslideup.vi v9, v8, 2
+; VLS-NEXT:    vs1r.v v9, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <4 x i32>, ptr %vp
   %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
@@ -118,13 +155,20 @@ define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v4i32_undef_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v8, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v4i32_undef_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; VLA-NEXT:    vse32.v v8, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v4i32_undef_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vs1r.v v8, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
   store <4 x i32> %v, ptr %vp
@@ -132,17 +176,27 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v10, v8
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v8i32_v2i32_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v10, (a0)
+; VLA-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLA-NEXT:    vmv.v.v v10, v8
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vse32.v v10, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vl2re32.v v10, (a0)
+; VLS-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
+; VLS-NEXT:    vmv.v.v v10, v8
+; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
@@ -151,17 +205,27 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 2
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v8i32_v2i32_2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v10, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; VLA-NEXT:    vslideup.vi v10, v8, 2
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vse32.v v10, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vle32.v v10, (a1)
+; VLS-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v10, 2
+; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
@@ -170,15 +234,25 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
-; CHECK-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v8i32_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vle32.v v10, (a0)
+; VLA-NEXT:    vslideup.vi v10, v8, 6
+; VLA-NEXT:    vse32.v v10, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v8i32_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vl2re32.v v8, (a0)
+; VLS-NEXT:    vle32.v v10, (a1)
+; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v10, 6
+; VLS-NEXT:    vs2r.v v8, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
@@ -187,14 +261,23 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v8i32_undef_v2i32_6:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vslideup.vi v10, v8, 6
-; CHECK-NEXT:    vse32.v v10, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v8i32_undef_v2i32_6:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLA-NEXT:    vle32.v v8, (a1)
+; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLA-NEXT:    vslideup.vi v10, v8, 6
+; VLA-NEXT:    vse32.v v10, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v8i32_undef_v2i32_6:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; VLS-NEXT:    vle32.v v8, (a1)
+; VLS-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; VLS-NEXT:    vslideup.vi v10, v8, 6
+; VLS-NEXT:    vs2r.v v10, (a0)
+; VLS-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
   store <8 x i32> %v, ptr %vp
@@ -239,18 +322,30 @@ define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v32i1_v8i1_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vlm.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vsm.v v8, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v32i1_v8i1_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 32
+; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vlm.v v9, (a1)
+; VLA-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
+; VLA-NEXT:    vmv.v.v v8, v9
+; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT:    vsm.v v8, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v32i1_v8i1_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vlm.v v9, (a1)
+; VLS-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
+; VLS-NEXT:    vmv.v.v v8, v9
+; VLS-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; VLS-NEXT:    vsm.v v8, (a0)
+; VLS-NEXT:    ret
   %v = load <32 x i1>, ptr %vp
   %sv = load <8 x i1>, ptr %svp
   %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
@@ -259,18 +354,30 @@ define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
 }
 
 define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) {
-; CHECK-LABEL: insert_v32i1_v8i1_16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vlm.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; CHECK-NEXT:    vsm.v v8, (a0)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v32i1_v8i1_16:
+; VLA:       # %bb.0:
+; VLA-NEXT:    li a2, 32
+; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLA-NEXT:    vlm.v v9, (a1)
+; VLA-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v9, 2
+; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
+; VLA-NEXT:    vsm.v v8, (a0)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v32i1_v8i1_16:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; VLS-NEXT:    vlm.v v9, (a1)
+; VLS-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v9, 2
+; VLS-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; VLS-NEXT:    vsm.v v8, (a0)
+; VLS-NEXT:    ret
   %v = load <32 x i1>, ptr %vp
   %sv = load <8 x i1>, ptr %svp
   %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
@@ -358,22 +465,36 @@ define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, ptr %sv
 }
 
 define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
-; CHECK-LABEL: insert_nxv2i1_v4i1_0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.i v10, 0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
-; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
-; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
-; CHECK-NEXT:    vmsne.vi v0, v9, 0
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_nxv2i1_v4i1_0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; VLA-NEXT:    vlm.v v8, (a0)
+; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; VLA-NEXT:    vmv.v.i v9, 0
+; VLA-NEXT:    vmerge.vim v9, v9, 1, v0
+; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; VLA-NEXT:    vmv.v.i v10, 0
+; VLA-NEXT:    vmv1r.v v0, v8
+; VLA-NEXT:    vmerge.vim v8, v10, 1, v0
+; VLA-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
+; VLA-NEXT:    vmv.v.v v9, v8
+; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
+; VLA-NEXT:    vmsne.vi v0, v9, 0
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_nxv2i1_v4i1_0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; VLS-NEXT:    vlm.v v8, (a0)
+; VLS-NEXT:    vmv.v.i v9, 0
+; VLS-NEXT:    vmerge.vim v10, v9, 1, v0
+; VLS-NEXT:    vmv1r.v v0, v8
+; VLS-NEXT:    vmerge.vim v8, v9, 1, v0
+; VLS-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
+; VLS-NEXT:    vmv.v.v v10, v8
+; VLS-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; VLS-NEXT:    vmsne.vi v0, v10, 0
+; VLS-NEXT:    ret
   %sv = load <4 x i1>, ptr %svp
   %c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
   ret <vscale x 2 x i1> %c
@@ -408,15 +529,24 @@ define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, ptr %svp)
 declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
 
 define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vle64.v v16, (a1)
-; CHECK-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 4
-; CHECK-NEXT:    vs8r.v v8, (a2)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v2i64_nxv16i64:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT:    vle64.v v8, (a0)
+; VLA-NEXT:    vle64.v v16, (a1)
+; VLA-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 4
+; VLA-NEXT:    vs8r.v v8, (a2)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl1re64.v v8, (a0)
+; VLS-NEXT:    vl1re64.v v16, (a1)
+; VLS-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vs8r.v v8, (a2)
+; VLS-NEXT:    ret
   %sv0 = load <2 x i64>, ptr %psv0
   %sv1 = load <2 x i64>, ptr %psv1
   %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
@@ -426,12 +556,18 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
 }
 
 define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vs8r.v v8, (a1)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v2i64_nxv16i64_lo0:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT:    vle64.v v8, (a0)
+; VLA-NEXT:    vs8r.v v8, (a1)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64_lo0:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl1re64.v v8, (a0)
+; VLS-NEXT:    vs8r.v v8, (a1)
+; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
   store <vscale x 16 x i64> %v, ptr %out
@@ -439,14 +575,22 @@ define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
 }
 
 define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
-; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
-; CHECK-NEXT:    vle64.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v16, v8, 2
-; CHECK-NEXT:    vs8r.v v16, (a1)
-; CHECK-NEXT:    ret
+; VLA-LABEL: insert_v2i64_nxv16i64_lo2:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; VLA-NEXT:    vle64.v v8, (a0)
+; VLA-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
+; VLA-NEXT:    vslideup.vi v16, v8, 2
+; VLA-NEXT:    vs8r.v v16, (a1)
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vl1re64.v v8, (a0)
+; VLS-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
+; VLS-NEXT:    vslideup.vi v16, v8, 2
+; VLS-NEXT:    vs8r.v v16, (a1)
+; VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
   store <vscale x 16 x i64> %v, ptr %out
@@ -521,6 +665,127 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
+; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi:
+; RV32VLA:       # %bb.0:
+; RV32VLA-NEXT:    addi sp, sp, -80
+; RV32VLA-NEXT:    .cfi_def_cfa_offset 80
+; RV32VLA-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32VLA-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32VLA-NEXT:    .cfi_offset ra, -4
+; RV32VLA-NEXT:    .cfi_offset s0, -8
+; RV32VLA-NEXT:    addi s0, sp, 80
+; RV32VLA-NEXT:    .cfi_def_cfa s0, 0
+; RV32VLA-NEXT:    csrr a2, vlenb
+; RV32VLA-NEXT:    slli a2, a2, 4
+; RV32VLA-NEXT:    sub sp, sp, a2
+; RV32VLA-NEXT:    andi sp, sp, -64
+; RV32VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV32VLA-NEXT:    vle64.v v8, (a0)
+; RV32VLA-NEXT:    addi a0, sp, 128
+; RV32VLA-NEXT:    vse64.v v8, (a0)
+; RV32VLA-NEXT:    csrr a0, vlenb
+; RV32VLA-NEXT:    slli a0, a0, 3
+; RV32VLA-NEXT:    addi a2, sp, 64
+; RV32VLA-NEXT:    add a3, a2, a0
+; RV32VLA-NEXT:    vl8re64.v v8, (a3)
+; RV32VLA-NEXT:    vl8re64.v v16, (a2)
+; RV32VLA-NEXT:    add a0, a1, a0
+; RV32VLA-NEXT:    vs8r.v v8, (a0)
+; RV32VLA-NEXT:    vs8r.v v16, (a1)
+; RV32VLA-NEXT:    addi sp, s0, -80
+; RV32VLA-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32VLA-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32VLA-NEXT:    addi sp, sp, 80
+; RV32VLA-NEXT:    ret
+;
+; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi:
+; RV64VLA:       # %bb.0:
+; RV64VLA-NEXT:    addi sp, sp, -80
+; RV64VLA-NEXT:    .cfi_def_cfa_offset 80
+; RV64VLA-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64VLA-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64VLA-NEXT:    .cfi_offset ra, -8
+; RV64VLA-NEXT:    .cfi_offset s0, -16
+; RV64VLA-NEXT:    addi s0, sp, 80
+; RV64VLA-NEXT:    .cfi_def_cfa s0, 0
+; RV64VLA-NEXT:    csrr a2, vlenb
+; RV64VLA-NEXT:    slli a2, a2, 4
+; RV64VLA-NEXT:    sub sp, sp, a2
+; RV64VLA-NEXT:    andi sp, sp, -64
+; RV64VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64VLA-NEXT:    vle64.v v8, (a0)
+; RV64VLA-NEXT:    addi a0, sp, 128
+; RV64VLA-NEXT:    vse64.v v8, (a0)
+; RV64VLA-NEXT:    csrr a0, vlenb
+; RV64VLA-NEXT:    slli a0, a0, 3
+; RV64VLA-NEXT:    addi a2, sp, 64
+; RV64VLA-NEXT:    add a3, a2, a0
+; RV64VLA-NEXT:    vl8re64.v v8, (a3)
+; RV64VLA-NEXT:    vl8re64.v v16, (a2)
+; RV64VLA-NEXT:    add a0, a1, a0
+; RV64VLA-NEXT:    vs8r.v v8, (a0)
+; RV64VLA-NEXT:    vs8r.v v16, (a1)
+; RV64VLA-NEXT:    addi sp, s0, -80
+; RV64VLA-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64VLA-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64VLA-NEXT:    addi sp, sp, 80
+; RV64VLA-NEXT:    ret
+;
+; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi:
+; RV32VLS:       # %bb.0:
+; RV32VLS-NEXT:    addi sp, sp, -80
+; RV32VLS-NEXT:    .cfi_def_cfa_offset 80
+; RV32VLS-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32VLS-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32VLS-NEXT:    .cfi_offset ra, -4
+; RV32VLS-NEXT:    .cfi_offset s0, -8
+; RV32VLS-NEXT:    addi s0, sp, 80
+; RV32VLS-NEXT:    .cfi_def_cfa s0, 0
+; RV32VLS-NEXT:    addi sp, sp, -256
+; RV32VLS-NEXT:    andi sp, sp, -64
+; RV32VLS-NEXT:    vl1re64.v v8, (a0)
+; RV32VLS-NEXT:    addi a0, sp, 128
+; RV32VLS-NEXT:    vs1r.v v8, (a0)
+; RV32VLS-NEXT:    addi a0, sp, 64
+; RV32VLS-NEXT:    addi a2, sp, 192
+; RV32VLS-NEXT:    vl8re64.v v8, (a2)
+; RV32VLS-NEXT:    vl8re64.v v16, (a0)
+; RV32VLS-NEXT:    addi a0, a1, 128
+; RV32VLS-NEXT:    vs8r.v v8, (a0)
+; RV32VLS-NEXT:    vs8r.v v16, (a1)
+; RV32VLS-NEXT:    addi sp, s0, -80
+; RV32VLS-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32VLS-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32VLS-NEXT:    addi sp, sp, 80
+; RV32VLS-NEXT:    ret
+;
+; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi:
+; RV64VLS:       # %bb.0:
+; RV64VLS-NEXT:    addi sp, sp, -80
+; RV64VLS-NEXT:    .cfi_def_cfa_offset 80
+; RV64VLS-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64VLS-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64VLS-NEXT:    .cfi_offset ra, -8
+; RV64VLS-NEXT:    .cfi_offset s0, -16
+; RV64VLS-NEXT:    addi s0, sp, 80
+; RV64VLS-NEXT:    .cfi_def_cfa s0, 0
+; RV64VLS-NEXT:    addi sp, sp, -256
+; RV64VLS-NEXT:    andi sp, sp, -64
+; RV64VLS-NEXT:    vl1re64.v v8, (a0)
+; RV64VLS-NEXT:    addi a0, sp, 128
+; RV64VLS-NEXT:    vs1r.v v8, (a0)
+; RV64VLS-NEXT:    addi a0, sp, 192
+; RV64VLS-NEXT:    vl8re64.v v8, (a0)
+; RV64VLS-NEXT:    addi a0, sp, 64
+; RV64VLS-NEXT:    vl8re64.v v16, (a0)
+; RV64VLS-NEXT:    addi a0, a1, 128
+; RV64VLS-NEXT:    vs8r.v v8, (a0)
+; RV64VLS-NEXT:    vs8r.v v16, (a1)
+; RV64VLS-NEXT:    addi sp, s0, -80
+; RV64VLS-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64VLS-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64VLS-NEXT:    addi sp, sp, 80
+; RV64VLS-NEXT:    ret
   %sv = load <2 x i64>, ptr %psv
   %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
   store <vscale x 16 x i64> %v, ptr %out
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
index 6ef5aa846d6d96..ce8827fe47536b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s
+
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s
 
 define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: concat_2xv4i32:
@@ -128,31 +131,51 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x
 }
 
 define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: concat_2xv16i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv4r.v v16, v12
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 16
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_2xv16i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv4r.v v16, v12
+; VLA-NEXT:    li a0, 32
+; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_2xv16i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv4r.v v16, v12
+; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    ret
   %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i32> %ab
 }
 
 define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
-; CHECK-LABEL: concat_4xv8i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmv2r.v v16, v14
-; CHECK-NEXT:    vmv2r.v v24, v12
-; CHECK-NEXT:    vmv2r.v v0, v10
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v0, 8
-; CHECK-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v24, 16
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 24
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_4xv8i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    vmv2r.v v16, v14
+; VLA-NEXT:    vmv2r.v v24, v12
+; VLA-NEXT:    vmv2r.v v0, v10
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 8
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 16
+; VLA-NEXT:    li a0, 32
+; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 24
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_4xv8i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    vmv2r.v v16, v14
+; VLS-NEXT:    vmv2r.v v24, v12
+; VLS-NEXT:    vmv2r.v v0, v10
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 8
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 16
+; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    ret
   %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %abcd = shufflevector <16 x i32> %ab, <16 x i32> %cd, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -160,82 +183,128 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x
 }
 
 define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) {
-; CHECK-LABEL: concat_8xv4i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
-; CHECK-NEXT:    vmv1r.v v16, v15
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v16, v14
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v16, v13
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v16, v12
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT:    vmv1r.v v0, v11
-; CHECK-NEXT:    vmv1r.v v24, v10
-; CHECK-NEXT:    vmv1r.v v16, v9
-; CHECK-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v16, 4
-; CHECK-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v24, 8
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v0, 12
-; CHECK-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v16, 16
-; CHECK-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v16, 20
-; CHECK-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v16, 24
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 0
-; CHECK-NEXT:    slli a0, a0, 3
-; CHECK-NEXT:    add a1, a1, a0
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    add a0, sp, a0
-; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT:    vslideup.vi v8, v16, 28
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 5
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; VLA-LABEL: concat_8xv4i32:
+; VLA:       # %bb.0:
+; VLA-NEXT:    addi sp, sp, -16
+; VLA-NEXT:    .cfi_def_cfa_offset 16
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    sub sp, sp, a0
+; VLA-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLA-NEXT:    vmv1r.v v16, v15
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    li a1, 0
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a1, a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v14
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v13
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v16, v12
+; VLA-NEXT:    addi a0, sp, 16
+; VLA-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLA-NEXT:    vmv1r.v v0, v11
+; VLA-NEXT:    vmv1r.v v24, v10
+; VLA-NEXT:    vmv1r.v v16, v9
+; VLA-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v16, 4
+; VLA-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v24, 8
+; VLA-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLA-NEXT:    vslideup.vi v8, v0, 12
+; VLA-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 16
+; VLA-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 20
+; VLA-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 4
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 24
+; VLA-NEXT:    li a0, 32
+; VLA-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    li a1, 0
+; VLA-NEXT:    slli a0, a0, 3
+; VLA-NEXT:    add a1, a1, a0
+; VLA-NEXT:    slli a0, a0, 1
+; VLA-NEXT:    add a0, a0, a1
+; VLA-NEXT:    add a0, sp, a0
+; VLA-NEXT:    addi a0, a0, 16
+; VLA-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLA-NEXT:    vslideup.vi v8, v16, 28
+; VLA-NEXT:    csrr a0, vlenb
+; VLA-NEXT:    slli a0, a0, 5
+; VLA-NEXT:    add sp, sp, a0
+; VLA-NEXT:    addi sp, sp, 16
+; VLA-NEXT:    ret
+;
+; VLS-LABEL: concat_8xv4i32:
+; VLS:       # %bb.0:
+; VLS-NEXT:    addi sp, sp, -16
+; VLS-NEXT:    .cfi_def_cfa_offset 16
+; VLS-NEXT:    addi sp, sp, -512
+; VLS-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; VLS-NEXT:    vmv1r.v v16, v15
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v14
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v13
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v16, v12
+; VLS-NEXT:    addi a0, sp, 16
+; VLS-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; VLS-NEXT:    vmv1r.v v0, v11
+; VLS-NEXT:    vmv1r.v v24, v10
+; VLS-NEXT:    vmv1r.v v16, v9
+; VLS-NEXT:    vsetivli zero, 8, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v16, 4
+; VLS-NEXT:    vsetivli zero, 12, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v24, 8
+; VLS-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
+; VLS-NEXT:    vslideup.vi v8, v0, 12
+; VLS-NEXT:    vsetivli zero, 20, e32, m8, tu, ma
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 16
+; VLS-NEXT:    vsetivli zero, 24, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 144
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 20
+; VLS-NEXT:    vsetivli zero, 28, e32, m8, tu, ma
+; VLS-NEXT:    addi a0, sp, 272
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 24
+; VLS-NEXT:    vsetvli a0, zero, e32, m8, ta, ma
+; VLS-NEXT:    addi a0, sp, 400
+; VLS-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
+; VLS-NEXT:    vslideup.vi v8, v16, 28
+; VLS-NEXT:    addi sp, sp, 512
+; VLS-NEXT:    addi sp, sp, 16
+; VLS-NEXT:    ret
   %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index 0f3f57a0dec59a..b15896580d4253 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -76,7 +76,7 @@ define <vscale x 4 x i8> @insert_nxv1i8_nxv4i8_3(<vscale x 4 x i8> %vec, <vscale
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a1, a1, a0
 ; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i8> @llvm.vector.insert.nxv1i8.nxv4i8(<vscale x 4 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
@@ -227,7 +227,7 @@ define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec,
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i32> @llvm.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
@@ -306,7 +306,7 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vsc
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    sub a1, a0, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
@@ -319,7 +319,7 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vs
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a1, a0, 3
 ; CHECK-NEXT:    sub a1, a0, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v10, a1
 ; CHECK-NEXT:    ret
   %v = call <vscale x 16 x i8> @llvm.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
@@ -344,7 +344,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
@@ -357,7 +357,7 @@ define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %ve
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v14, v16, a0
 ; CHECK-NEXT:    ret
   %v = call <vscale x 32 x half> @llvm.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
index 243dc19a25588d..1ef63ffa9ee0c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll
@@ -2235,9 +2235,9 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFH-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; ZVFH-NEXT:    vmfeq.vv v16, v8, v24, v0.t
 ; ZVFH-NEXT:    add a0, a1, a1
-; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
+; ZVFH-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; ZVFH-NEXT:    vslideup.vx v16, v1, a1
-; ZVFH-NEXT:    vmv1r.v v0, v16
+; ZVFH-NEXT:    vmv.v.v v0, v16
 ; ZVFH-NEXT:    csrr a0, vlenb
 ; ZVFH-NEXT:    slli a0, a0, 4
 ; ZVFH-NEXT:    add sp, sp, a0
@@ -2337,7 +2337,7 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:  # %bb.3:
 ; ZVFHMIN-NEXT:    mv a2, a5
 ; ZVFHMIN-NEXT:  .LBB85_4:
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v2, v26, a3
 ; ZVFHMIN-NEXT:    sub a5, a2, a4
 ; ZVFHMIN-NEXT:    sltu a6, a2, a5
@@ -2395,12 +2395,12 @@ define <vscale x 64 x i1> @fcmp_oeq_vv_nxv64f16(<vscale x 64 x half> %va, <vscal
 ; ZVFHMIN-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
 ; ZVFHMIN-NEXT:    vmv1r.v v0, v1
 ; ZVFHMIN-NEXT:    vmfeq.vv v8, v16, v24, v0.t
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v3, a3
 ; ZVFHMIN-NEXT:    add a0, a1, a1
-; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, tu, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v8, v2, a1
-; ZVFHMIN-NEXT:    vmv1r.v v0, v8
+; ZVFHMIN-NEXT:    vmv.v.v v0, v8
 ; ZVFHMIN-NEXT:    csrr a0, vlenb
 ; ZVFHMIN-NEXT:    li a1, 34
 ; ZVFHMIN-NEXT:    mul a0, a0, a1
@@ -3637,7 +3637,7 @@ define <vscale x 32 x i1> @fcmp_oeq_vv_nxv32f64(<vscale x 32 x double> %va, <vsc
 ; CHECK-NEXT:    slli a0, a1, 1
 ; CHECK-NEXT:    add a0, a0, a1
 ; CHECK-NEXT:    add a1, a0, a1
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v17, v16, a0
 ; CHECK-NEXT:    vmv1r.v v0, v17
 ; CHECK-NEXT:    csrr a0, vlenb
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
index e77966d8c43b37..aee255196ce2e8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll
@@ -3387,7 +3387,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    srli a0, a0, 3
 ; RV32-NEXT:    add a1, a0, a0
-; RV32-NEXT:    vsetvli zero, a1, e8, mf4, tu, ma
+; RV32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; RV32-NEXT:    vslideup.vx v0, v24, a0
 ; RV32-NEXT:    ret
 ;
@@ -3400,7 +3400,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    srli a0, a0, 3
 ; RV64-NEXT:    add a1, a0, a0
-; RV64-NEXT:    vsetvli zero, a1, e8, mf4, tu, ma
+; RV64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; RV64-NEXT:    vslideup.vx v0, v24, a0
 ; RV64-NEXT:    ret
 ;
@@ -3413,7 +3413,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN32-NEXT:    csrr a0, vlenb
 ; ZVFHMIN32-NEXT:    srli a0, a0, 3
 ; ZVFHMIN32-NEXT:    add a1, a0, a0
-; ZVFHMIN32-NEXT:    vsetvli zero, a1, e8, mf4, tu, ma
+; ZVFHMIN32-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; ZVFHMIN32-NEXT:    vslideup.vx v0, v24, a0
 ; ZVFHMIN32-NEXT:    ret
 ;
@@ -3426,7 +3426,7 @@ define <vscale x 16 x i1> @fcmp_oeq_vf_nx16f64(<vscale x 16 x double> %va) {
 ; ZVFHMIN64-NEXT:    csrr a0, vlenb
 ; ZVFHMIN64-NEXT:    srli a0, a0, 3
 ; ZVFHMIN64-NEXT:    add a1, a0, a0
-; ZVFHMIN64-NEXT:    vsetvli zero, a1, e8, mf4, tu, ma
+; ZVFHMIN64-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; ZVFHMIN64-NEXT:    vslideup.vx v0, v24, a0
 ; ZVFHMIN64-NEXT:    ret
   %vc = fcmp oeq <vscale x 16 x double> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
index 007afe12b8e438..a23b7c7b6ae9e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll
@@ -2424,7 +2424,7 @@ define <vscale x 32 x i1> @icmp_eq_vv_nxv32i32(<vscale x 32 x i32> %va, <vscale
 ; CHECK-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
 ; CHECK-NEXT:    vmseq.vv v16, v8, v24, v0.t
 ; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v1, a1
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -2459,7 +2459,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_nxv32i32(<vscale x 32 x i32> %va, i32 %b,
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    add a0, a2, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v25, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
@@ -2492,7 +2492,7 @@ define <vscale x 32 x i1> @icmp_eq_vx_swap_nxv32i32(<vscale x 32 x i32> %va, i32
 ; CHECK-NEXT:    vmv1r.v v0, v24
 ; CHECK-NEXT:    vmseq.vx v16, v8, a0, v0.t
 ; CHECK-NEXT:    add a0, a2, a2
-; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v16, v25, a2
 ; CHECK-NEXT:    vmv1r.v v0, v16
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
index a2ac684604b944..5f35a4e50a9523 100644
--- a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll
@@ -3235,7 +3235,7 @@ define <vscale x 16 x i1> @icmp_eq_vi_nx16i64(<vscale x 16 x i64> %va) {
 ; CHECK-NEXT:    vsetvli a2, zero, e64, m8, ta, ma
 ; CHECK-NEXT:    vmseq.vi v24, v16, 0
 ; CHECK-NEXT:    vmseq.vi v0, v8, 0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v24, a0
 ; CHECK-NEXT:    ret
   %vc = icmp eq <vscale x 16 x i64> %va, zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
index c23c10205e6e36..4c64b1677b3626 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll
@@ -22,7 +22,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vsc
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
 ; CHECK-NEXT:    add a2, a1, a1
-; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v8, a1
 ; CHECK-NEXT:    vsetvli a1, zero, e8, m4, ta, ma
 ; CHECK-NEXT:    vsm.v v9, (a0)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index e84fd1b1a7036a..1acc0fec8fe586 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -24,7 +24,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v8, a0
 ; CHECK-NEXT:    ret
 ;
@@ -44,7 +44,7 @@ define <vscale x 32 x i1> @vector_interleave_nxv32i1_nxv16i1(<vscale x 16 x i1>
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    srli a0, a0, 2
 ; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; ZVBB-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v8, a0
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
@@ -376,9 +376,9 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v8, v10, a0
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a0
-; CHECK-NEXT:    vmv1r.v v8, v10
+; CHECK-NEXT:    vmv.v.v v8, v10
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
@@ -391,9 +391,9 @@ define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half
 ; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
 ; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a0
-; ZVBB-NEXT:    vmv1r.v v8, v10
+; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
   %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
   ret <vscale x 4 x half> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
index 8e983f63428a6a..b888fde7d06836 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll
@@ -937,7 +937,7 @@ define <vscale x 32 x i1> @vfptosi_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-NEXT:    vfncvt.rtz.x.f.w v8, v24
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
   %evec = fptosi <vscale x 32 x half> %va to <vscale x 32 x i1>
@@ -967,7 +967,7 @@ define <vscale x 32 x i1> @vfptoui_nxv32f16_nxv32i1(<vscale x 32 x half> %va) {
 ; ZVFHMIN-NEXT:    vfncvt.rtz.xu.f.w v8, v24
 ; ZVFHMIN-NEXT:    vand.vi v8, v8, 1
 ; ZVFHMIN-NEXT:    vmsne.vi v0, v8, 0
-; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
+; ZVFHMIN-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
 ; ZVFHMIN-NEXT:    vslideup.vx v0, v16, a0
 ; ZVFHMIN-NEXT:    ret
   %evec = fptoui <vscale x 32 x half> %va to <vscale x 32 x i1>
diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
index 2546ec95a00793..6d42b15273cf86 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll
@@ -894,7 +894,7 @@ define half @vreduce_ord_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-NEXT:    lui a2, 1048568
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a2
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
@@ -916,7 +916,7 @@ define half @vreduce_ord_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v10, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
@@ -938,11 +938,11 @@ define half @vreduce_ord_fadd_nxv10f16(<vscale x 10 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v12, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.v.v v11, v12
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v12, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v12, fa0
@@ -982,7 +982,7 @@ define half @vreduce_fadd_nxv3f16(<vscale x 3 x half> %v, half %s) {
 ; CHECK-NEXT:    lui a2, 1048568
 ; CHECK-NEXT:    vsetvli a3, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vmv.v.x v9, a2
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v9, a1
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v9, fa0
@@ -1002,7 +1002,7 @@ define half @vreduce_fadd_nxv6f16(<vscale x 6 x half> %v, half %s) {
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v10, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
 ; CHECK-NEXT:    vfmv.s.f v10, fa0
@@ -1025,11 +1025,11 @@ define half @vreduce_fmin_nxv10f16(<vscale x 10 x half> %v) {
 ; CHECK-NEXT:    vlse16.v v12, (a1), zero
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v12, a0
 ; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
 ; CHECK-NEXT:    vmv.v.v v11, v12
-; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v11, v12, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
 ; CHECK-NEXT:    vfredmin.vs v8, v8, v8
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
index daf46c6eef0252..4f33439db770dc 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH
-; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH
+; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ
 
 ; Currently multivalue returning functions are not supported in Emscripten EH /
 ; SjLj. Make sure they error out.
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
index 4fadbd5f07e6df..4b4661b144667f 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
index f4f93ac2f30ce3..52a8c686824d33 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Test functions have been generated by multivalue-stackify.py.
 
-; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s
 
 ; Test that the multivalue stackification works
 
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll
index 846691e5ff0cda..675009c8f3e548 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS
-; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS
+; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ
 
 ; Test that the multivalue calls, returns, function types, and block
 ; types work as expected.
@@ -20,7 +19,6 @@ declare void @use_i64(i64)
 ; CHECK-NEXT: i32.const 42{{$}}
 ; CHECK-NEXT: i64.const 42{{$}}
 ; CHECK-NEXT: end_function{{$}}
-; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64)
 define %pair @pair_const() {
   ret %pair { i32 42, i64 42 }
 }
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
index 7bf37b59353ad6..47c5ae7b457dde 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE
+; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE
 ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE
 
 ; Test libcall signatures when multivalue is enabled and disabled
diff --git a/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll
new file mode 100644
index 00000000000000..a38243ca218cc1
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -wasm-ref-type-mem2local -S | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+%externref = type ptr addrspace(10)
+%funcref = type ptr addrspace(20)
+
+declare %externref @get_externref()
+declare %funcref @get_funcref()
+declare i32 @get_i32()
+declare void @take_externref(%externref)
+declare void @take_funcref(%funcref)
+declare void @take_i32(i32)
+
+; Reference type allocas should be moved to addrspace(1)
+; CHECK-LABEL: @test_ref_type_mem2local
+define void @test_ref_type_mem2local() {
+entry:
+  %alloc.externref = alloca %externref, align 1
+  %eref = call %externref @get_externref()
+  store %externref %eref, ptr %alloc.externref, align 1
+  %eref.loaded = load %externref, ptr %alloc.externref, align 1
+  call void @take_externref(%externref %eref.loaded)
+  ; CHECK:      %alloc.externref.var = alloca ptr addrspace(10), align 1, addrspace(1)
+  ; CHECK-NEXT: %eref = call ptr addrspace(10) @get_externref()
+  ; CHECK-NEXT: store ptr addrspace(10) %eref, ptr addrspace(1) %alloc.externref.var, align 1
+  ; CHECK-NEXT: %eref.loaded = load ptr addrspace(10), ptr addrspace(1) %alloc.externref.var, align 1
+  ; CHECK-NEXT: call void @take_externref(ptr addrspace(10) %eref.loaded)
+
+  %alloc.funcref = alloca %funcref, align 1
+  %fref = call %funcref @get_funcref()
+  store %funcref %fref, ptr %alloc.funcref, align 1
+  %fref.loaded = load %funcref, ptr %alloc.funcref, align 1
+  call void @take_funcref(%funcref %fref.loaded)
+  ; CHECK-NEXT: %alloc.funcref.var = alloca ptr addrspace(20), align 1, addrspace(1)
+  ; CHECK-NEXT: %fref = call ptr addrspace(20) @get_funcref()
+  ; CHECK-NEXT: store ptr addrspace(20) %fref, ptr addrspace(1) %alloc.funcref.var, align 1
+  ; CHECK-NEXT: %fref.loaded = load ptr addrspace(20), ptr addrspace(1) %alloc.funcref.var, align 1
+  ; CHECK-NEXT: call void @take_funcref(ptr addrspace(20) %fref.loaded)
+
+  ret void
+}
+
+; POD type allocas should stay the same
+; CHECK-LABEL: @test_pod_type
+define void @test_pod_type() {
+entry:
+  %alloc.i32 = alloca i32
+  %i32 = call i32 @get_i32()
+  store i32 %i32, ptr %alloc.i32
+  %i32.loaded = load i32, ptr %alloc.i32
+  call void @take_i32(i32 %i32.loaded)
+  ; CHECK: %alloc.i32 = alloca i32, align 4{{$}}
+  ; CHECK-NOT: addrspace(1)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
index 7039e33c00935d..cbb5bd09c2399a 100644
--- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -160,6 +160,53 @@ define <16 x i8> @demandedelts_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>
   ret <16 x i8> %5
 }
 
+define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; SSE-LABEL: demandedbits_sitofp_blendvps:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
+; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: demandedbits_sitofp_blendvps:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = sitofp <4 x i32> %a2 to <4 x float>
+  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
+  ret <4 x float> %sel
+}
+
+define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
+; SSE-LABEL: demandedbits_uitofp_blendvps:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps %xmm0, %xmm3
+; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200]
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; SSE-NEXT:    psrld $16, %xmm2
+; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE-NEXT:    addps %xmm2, %xmm0
+; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
+; SSE-NEXT:    movaps %xmm3, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: demandedbits_uitofp_blendvps:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; AVX-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
+; AVX-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT:    vaddps %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %cvt = uitofp <4 x i32> %a2 to <4 x float>
+  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
+  ret <4 x float> %sel
+}
+
 define <2 x i64> @demandedbits_blendvpd(i64 %a0, i64 %a2, <2 x double> %a3) {
 ; SSE-LABEL: demandedbits_blendvpd:
 ; SSE:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index 0acf82f5a144a2..f4efacc1946cff 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -270,11 +270,10 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
 ; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -564,14 +563,13 @@ define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
 ; KNL-LABEL: kshiftr_v64i1_63:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k1
-; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vpcmpeqb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll
index 092e417c812e78..3d1b04f7435713 100644
--- a/llvm/test/CodeGen/X86/pr46455.ll
+++ b/llvm/test/CodeGen/X86/pr46455.ll
@@ -4,10 +4,10 @@
 define void @EntryModule(ptr %buffer_table) {
 ; CHECK-LABEL: EntryModule:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movq 24(%rdi), %rcx
-; CHECK-NEXT:    vcmpneqps (%rax), %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpneqps (%rax), %xmm0, %xmm0
 ; CHECK-NEXT:    vpsrld $31, %xmm0, %xmm1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
@@ -16,7 +16,6 @@ define void @EntryModule(ptr %buffer_table) {
 ; CHECK-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, (%rcx)
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %i1 = load ptr, ptr %buffer_table, align 8
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index 705e48ca4c9c9b..90e5c279d2e17c 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -1,22 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s   | FileCheck %s --check-prefix=KNL-32
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx  < %s | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s   | FileCheck %s --check-prefixes=AVX,KNL-32
 
 
 ; Verify that we don't crash during codegen due to a wrong lowering
 ; of a setcc node with illegal operand types and return type.
 
 define <8 x i16> @pr25080(<8 x i32> %a) {
-; AVX-LABEL: pr25080:
-; AVX:       # %bb.0: # %entry
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: pr25080:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: pr25080:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [8388607,8388607,8388607,8388607]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
 ; KNL-32-LABEL: pr25080:
 ; KNL-32:       # %bb.0: # %entry
@@ -38,23 +51,40 @@ entry:
 }
 
 define void @pr26232(i64 %a, <16 x i1> %b) {
-; AVX-LABEL: pr26232:
-; AVX:       # %bb.0: # %allocas
-; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    .p2align 4, 0x90
-; AVX-NEXT:  .LBB1_1: # %for_loop599
-; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX-NEXT:    cmpq $65536, %rdi # imm = 0x10000
-; AVX-NEXT:    setl %al
-; AVX-NEXT:    vmovd %eax, %xmm2
-; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT:    vpand %xmm0, %xmm2, %xmm2
-; AVX-NEXT:    vpsllw $7, %xmm2, %xmm2
-; AVX-NEXT:    vpmovmskb %xmm2, %eax
-; AVX-NEXT:    testl %eax, %eax
-; AVX-NEXT:    jne .LBB1_1
-; AVX-NEXT:  # %bb.2: # %for_exit600
-; AVX-NEXT:    retq
+; AVX1-LABEL: pr26232:
+; AVX1:       # %bb.0: # %allocas
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    .p2align 4, 0x90
+; AVX1-NEXT:  .LBB1_1: # %for_loop599
+; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT:    cmpq $65536, %rdi # imm = 0x10000
+; AVX1-NEXT:    setl %al
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm0, %xmm2, %xmm2
+; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovmskb %xmm2, %eax
+; AVX1-NEXT:    testl %eax, %eax
+; AVX1-NEXT:    jne .LBB1_1
+; AVX1-NEXT:  # %bb.2: # %for_exit600
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: pr26232:
+; AVX2:       # %bb.0: # %allocas
+; AVX2-NEXT:    .p2align 4, 0x90
+; AVX2-NEXT:  .LBB1_1: # %for_loop599
+; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT:    cmpq $65536, %rdi # imm = 0x10000
+; AVX2-NEXT:    setl %al
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm1
+; AVX2-NEXT:    vpsllw $7, %xmm1, %xmm1
+; AVX2-NEXT:    vpmovmskb %xmm1, %eax
+; AVX2-NEXT:    testl %eax, %eax
+; AVX2-NEXT:    jne .LBB1_1
+; AVX2-NEXT:  # %bb.2: # %for_exit600
+; AVX2-NEXT:    retq
 ;
 ; KNL-32-LABEL: pr26232:
 ; KNL-32:       # %bb.0: # %allocas
@@ -108,14 +138,7 @@ define <4 x i32> @pcmpgt(<4 x i8> %x) {
 ; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    retq
-;
-; KNL-32-LABEL: pcmpgt:
-; KNL-32:       # %bb.0:
-; KNL-32-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; KNL-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-32-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; KNL-32-NEXT:    retl
+; AVX-NEXT:    ret{{[l|q]}}
   %zext = zext <4 x i8> %x to <4 x i32>
   %icmp = icmp ne <4 x i32> %zext, zeroinitializer
   %sext = sext <4 x i1> %icmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
index 2e482753e26853..3c3944c2082bd5 100644
--- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
@@ -149,7 +149,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    fldt f80
 ; CHECK-NEXT:    fld1
 ; CHECK-NEXT:    faddp %st, %st(1)
@@ -163,7 +162,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
 ; CHECK0-NEXT:    pushl %ebp
 ; CHECK0-NEXT:    movl %esp, %ebp
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    fldt f80
 ; CHECK0-NEXT:    fld1
 ; CHECK0-NEXT:    faddp %st, %st(1)
@@ -188,7 +186,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    leal 20(%ebp), %eax
 ; CHECK-NEXT:    leal 4(%ebp), %ecx
 ; CHECK-NEXT:    movl %ecx, sink_address
@@ -206,7 +203,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
 ; CHECK0-NEXT:    pushl %ecx
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    leal 4(%ebp), %ecx
 ; CHECK0-NEXT:    movl %ecx, %eax
 ; CHECK0-NEXT:    addl $16, %eax
@@ -234,7 +230,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    movl 4(%ebp), %eax
 ; CHECK-NEXT:    leal 24(%ebp), %ecx
 ; CHECK-NEXT:    leal 8(%ebp), %edx
@@ -257,7 +252,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
 ; CHECK0-NEXT:    pushl %ecx
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    leal 8(%ebp), %edx
 ; CHECK0-NEXT:    movl %edx, %ecx
@@ -288,7 +282,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    leal 4(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, sink_address
 ; CHECK-NEXT:    leal -4(%ebp), %esp
@@ -303,7 +296,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
 ; CHECK0-NEXT:    movl %esp, %ebp
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    leal 4(%ebp), %eax
 ; CHECK0-NEXT:    movl %eax, sink_address
@@ -358,7 +350,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-32, %esp
 ; CHECK-NEXT:    subl $32, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    movl 4(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    leal -4(%ebp), %esp
@@ -374,7 +365,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-32, %esp
 ; CHECK0-NEXT:    subl $32, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    movl %eax, (%esp)
 ; CHECK0-NEXT:    leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
index a46b9d9ba5a100..1fe395b84d46c3 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
@@ -21,28 +21,24 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_empty(ptr nocapture byval(%s
 ; CHECK-USER-LABEL: test_uintr_isr_cc_empty:
 ; CHECK-USER:       # %bb.0: # %entry
 ; CHECK-USER-NEXT:    pushq %rax
-; CHECK-USER-NEXT:    cld
 ; CHECK-USER-NEXT:    addq $16, %rsp
 ; CHECK-USER-NEXT:    uiret
 ;
 ; CHECK0-USER-LABEL: test_uintr_isr_cc_empty:
 ; CHECK0-USER:       # %bb.0: # %entry
 ; CHECK0-USER-NEXT:    pushq %rax
-; CHECK0-USER-NEXT:    cld
 ; CHECK0-USER-NEXT:    addq $16, %rsp
 ; CHECK0-USER-NEXT:    uiret
 ;
 ; CHECK-KERNEL-LABEL: test_uintr_isr_cc_empty:
 ; CHECK-KERNEL:       # %bb.0: # %entry
 ; CHECK-KERNEL-NEXT:    pushq %rax
-; CHECK-KERNEL-NEXT:    cld
 ; CHECK-KERNEL-NEXT:    addq $16, %rsp
 ; CHECK-KERNEL-NEXT:    iretq
 ;
 ; CHECK0-KERNEL-LABEL: test_uintr_isr_cc_empty:
 ; CHECK0-KERNEL:       # %bb.0: # %entry
 ; CHECK0-KERNEL-NEXT:    pushq %rax
-; CHECK0-KERNEL-NEXT:    cld
 ; CHECK0-KERNEL-NEXT:    addq $16, %rsp
 ; CHECK0-KERNEL-NEXT:    iretq
 entry:
@@ -75,7 +71,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK-USER-NEXT:    pushq %rax
 ; CHECK-USER-NEXT:    pushq %rdx
 ; CHECK-USER-NEXT:    pushq %rcx
-; CHECK-USER-NEXT:    cld
 ; CHECK-USER-NEXT:    movq 32(%rsp), %rax
 ; CHECK-USER-NEXT:    movq 40(%rsp), %rcx
 ; CHECK-USER-NEXT:    movq 48(%rsp), %rdx
@@ -96,7 +91,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK0-USER-NEXT:    pushq %rax
 ; CHECK0-USER-NEXT:    pushq %rdx
 ; CHECK0-USER-NEXT:    pushq %rcx
-; CHECK0-USER-NEXT:    cld
 ; CHECK0-USER-NEXT:    movq 32(%rsp), %rax
 ; CHECK0-USER-NEXT:    leaq 40(%rsp), %rcx
 ; CHECK0-USER-NEXT:    movq (%rcx), %rdx
@@ -118,7 +112,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK-KERNEL-NEXT:    pushq %rax
 ; CHECK-KERNEL-NEXT:    pushq %rdx
 ; CHECK-KERNEL-NEXT:    pushq %rcx
-; CHECK-KERNEL-NEXT:    cld
 ; CHECK-KERNEL-NEXT:    movq 32(%rsp), %rax
 ; CHECK-KERNEL-NEXT:    movq 40(%rsp), %rcx
 ; CHECK-KERNEL-NEXT:    movq 48(%rsp), %rdx
@@ -139,7 +132,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK0-KERNEL-NEXT:    pushq %rax
 ; CHECK0-KERNEL-NEXT:    pushq %rdx
 ; CHECK0-KERNEL-NEXT:    pushq %rcx
-; CHECK0-KERNEL-NEXT:    cld
 ; CHECK0-KERNEL-NEXT:    movq 32(%rsp), %rax
 ; CHECK0-KERNEL-NEXT:    leaq 40(%rsp), %rcx
 ; CHECK0-KERNEL-NEXT:    movq (%rcx), %rdx
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc.ll b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
index 443d4c2fa46490..5fc606eb566ea6 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
@@ -114,7 +114,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
   ; CHECK: # %bb.0: # %entry
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK-DAG: leaq 8(%rbp), %[[R1:[^ ]*]]
   ; CHECK-DAG: leaq 40(%rbp), %[[R2:[^ ]*]]
   ; CHECK: movq %[[R1]], sink_address
@@ -136,7 +135,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
   ; CHECK-NEXT: pushq %rax
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK-DAG: movq 16(%rbp), %[[R3:[^ ]*]]
   ; CHECK-DAG: leaq 24(%rbp), %[[R1:[^ ]*]]
   ; CHECK-DAG: leaq 56(%rbp), %[[R2:[^ ]*]]
@@ -164,7 +162,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
   ; CHECK-NEXT: pushq %rax
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK: leaq 16(%rbp), %[[R1:[^ ]*]]
   ; CHECK: movq %[[R1]], sink_address(%rip)
 entry:
diff --git a/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
new file mode 100644
index 00000000000000..c143f69f126a85
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
@@ -0,0 +1,10 @@
+; This test checks that we are not instrumenting unnecessary globals
+; RUN: opt < %s -passes=asan -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@v_available_externally = available_externally global i32 zeroinitializer
+; CHECK-NOT: {{asan_gen.*v_available_externally}}
+
+; CHECK: @asan.module_ctor
diff --git a/llvm/test/MC/AMDGPU/gfx1011_err.s b/llvm/test/MC/AMDGPU/gfx1011_err.s
index 4b37aaf221e395..a86e48a29c78f7 100644
--- a/llvm/test/MC/AMDGPU/gfx1011_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1011_err.s
@@ -17,7 +17,7 @@ v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
-// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 v_fma_legacy_f32 v0, v1, v2, v3
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s
index ba8784a39c3698..f4ab5fe5b14a92 100644
--- a/llvm/test/MC/AMDGPU/gfx1030_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1030_err.s
@@ -25,7 +25,7 @@ s_get_waveid_in_workgroup s0
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
 
 s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
-// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 v_mac_f32 v0, v1, v2
 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx10_err_pos.s b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
index 1d34f00ee0f921..c2679db3b2acfa 100644
--- a/llvm/test/MC/AMDGPU/gfx10_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx10_err_pos.s
@@ -448,7 +448,7 @@ ds_swizzle_b32 v8, v2 offset:SWZ(QUAD_PERM, 0, 1, 2, 3)
 // expected a hwreg macro or an absolute expression
 
 s_setreg_b32 undef, s2
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression
 // CHECK-NEXT:{{^}}s_setreg_b32 undef, s2
 // CHECK-NEXT:{{^}}             ^
 
@@ -621,10 +621,10 @@ s_setreg_b32  hwreg(3,0,33), s2
 // CHECK-NEXT:{{^}}                        ^
 
 //==============================================================================
-// invalid code of hardware register: only 6-bit values are legal
+// invalid hardware register: only 6-bit values are legal
 
 s_setreg_b32  hwreg(0x40), s2
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
 // CHECK-NEXT:{{^}}s_setreg_b32  hwreg(0x40), s2
 // CHECK-NEXT:{{^}}                    ^
 
@@ -1158,10 +1158,10 @@ v_movrels_b32_sdwa v0, shared_base
 // CHECK-NEXT:{{^}}                       ^
 
 //==============================================================================
-// specified hardware register is not supported on this GPU
+// invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
-// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // CHECK-NEXT:{{^}}s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
 // CHECK-NEXT:{{^}}                       ^
 
diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
index 5ee9480677be92..e208b6cf903d38 100644
--- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s
+++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s
@@ -197,23 +197,23 @@ scratch_load_lds_ushort v2, off
 // GFX940: scratch_load_lds_sshort v2, off         ; encoding: [0x00,0x60,0xa4,0xdc,0x02,0x00,0x7f,0x00]
 scratch_load_lds_sshort v2, off
 
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX940: s_getreg_b32 s1, hwreg(HW_REG_XCC_ID)   ; encoding: [0x14,0xf8,0x81,0xb8]
 s_getreg_b32 s1, hwreg(HW_REG_XCC_ID)
 
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) ; encoding: [0x15,0xf8,0x81,0xb8]
 s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA)
 
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) ; encoding: [0x16,0xf8,0x81,0xb8]
 s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1)
 
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) ; encoding: [0x17,0xf8,0x81,0xb8]
 s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO)
 
-// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) ; encoding: [0x18,0xf8,0x81,0xb8]
 s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI)
 
diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s
index 515b89513a8048..000f3decf96071 100644
--- a/llvm/test/MC/AMDGPU/gfx940_err.s
+++ b/llvm/test/MC/AMDGPU/gfx940_err.s
@@ -97,22 +97,22 @@ v_cvt_pk_fp8_f32 v1, v2, v3 mul:2
 // GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 
 s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_HI)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s1, hwreg(HW_REG_XNACK_MASK)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s1, hwreg(HW_REG_HW_ID1)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s1, hwreg(HW_REG_HW_ID2)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s1, hwreg(HW_REG_POPS_PACKER)
-// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 ds_ordered_count v5, v1 offset:65535 gds
 // GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/sopk-err.s b/llvm/test/MC/AMDGPU/sopk-err.s
index 504ee1d11cbc97..cd92343b0e7fe7 100644
--- a/llvm/test/MC/AMDGPU/sopk-err.s
+++ b/llvm/test/MC/AMDGPU/sopk-err.s
@@ -5,48 +5,127 @@
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s
 // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefix=GFX11 %s
 
-// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: %s
-// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR --implicit-check-not=error: %s
+// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: --strict-whitespace %s
+// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR --implicit-check-not=error: --strict-whitespace %s
 
 s_setreg_b32  0x1f803, s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32  0x1f803, s2
+// GCN-NEXT: {{^}}              ^
 
 s_setreg_b32  typo(0x40), s2
-// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression
+// GCN-NEXT: {{^}}s_setreg_b32  typo(0x40), s2
+// GCN-NEXT: {{^}}              ^
 
 s_setreg_b32  hwreg(0x40), s2
-// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(0x40), s2
+// GCN-NEXT: {{^}}                    ^
+
+s_setreg_b32  {id: 0x40}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 0x40}, s2
+// GCN-NEXT: {{^}}                   ^
 
 s_setreg_b32  hwreg(HW_REG_WRONG), s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a register name or an absolute expression
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(HW_REG_WRONG), s2
+// GCN-NEXT: {{^}}                    ^
 
 s_setreg_b32  hwreg(1 2,3), s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma or a closing parenthesis
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(1 2,3), s2
+// GCN-NEXT: {{^}}                      ^
 
 s_setreg_b32  hwreg(1,2 3), s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(1,2 3), s2
+// GCN-NEXT: {{^}}                        ^
 
 s_setreg_b32  hwreg(1,2,3, s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a closing parenthesis
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(1,2,3, s2
+// GCN-NEXT: {{^}}                         ^
+
+s_setreg_b32  {id: 1 offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 1 offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}}                     ^
+
+s_setreg_b32  {id: 1 offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 1 offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}}                     ^
+
+s_setreg_b32  {id 1, offset: 2, size: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected
+// GCN-NEXT: {{^}}s_setreg_b32  {id 1, offset: 2, size: 3}, s2
+// GCN-NEXT: {{^}}                  ^
+
+s_setreg_b32  {id: 1, offset: 2, size: 3, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 1, offset: 2, size: 3, s2
+// GCN-NEXT: {{^}}                                            ^
+
+s_setreg_b32  {id: 1, offset: 2, blah: 3}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: unknown field
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 1, offset: 2, blah: 3}, s2
+// GCN-NEXT: {{^}}                                 ^
+
+s_setreg_b32  {id: 1, id: 2}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: duplicate field
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 1, id: 2}, s2
+// GCN-NEXT: {{^}}                      ^
 
 s_setreg_b32  hwreg(3,32,32), s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(3,32,32), s2
+// GCN-NEXT: {{^}}                      ^
+
+s_setreg_b32  {id: 3, offset: 32, size: 32}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 3, offset: 32, size: 32}, s2
+// GCN-NEXT: {{^}}                              ^
 
 s_setreg_b32  hwreg(3,0,33), s2
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_b32  hwreg(3,0,33), s2
+// GCN-NEXT: {{^}}                        ^
+
+s_setreg_b32  {id: 3, offset: 0, size: 33}, s2
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_b32  {id: 3, offset: 0, size: 33}, s2
+// GCN-NEXT: {{^}}                                       ^
 
 s_setreg_imm32_b32  0x1f803, 0xff
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32  0x1f803, 0xff
+// GCN-NEXT: {{^}}                    ^
 
 s_setreg_imm32_b32  hwreg(3,0,33), 0xff
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32  hwreg(3,0,33), 0xff
+// GCN-NEXT: {{^}}                              ^
+
+s_setreg_imm32_b32  {id: 3, offset: 0, size: 33}, 0xff
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal
+// GCN-NEXT: {{^}}s_setreg_imm32_b32  {id: 3, offset: 0, size: 33}, 0xff
+// GCN-NEXT: {{^}}                                             ^
 
 s_getreg_b32  s2, hwreg(3,32,32)
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_getreg_b32  s2, hwreg(3,32,32)
+// GCN-NEXT: {{^}}                          ^
+
+s_getreg_b32  s2, {id: 3, offset: 32, size: 32}
+// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal
+// GCN-NEXT: {{^}}s_getreg_b32  s2, {id: 3, offset: 32, size: 32}
+// GCN-NEXT: {{^}}                                  ^
 
 s_cbranch_i_fork s[2:3], 0x6
 // SICI: s_cbranch_i_fork s[2:3], 6 ; encoding: [0x06,0x00,0x82,0xb8]
@@ -57,69 +136,109 @@ s_cbranch_i_fork s[2:3], 0x6
 
 s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES)
 // GFX10:  s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES)
+// SICIVI-ERR-NEXT: {{^}}                       ^
 // GFX9: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8]
 // GFX11: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) ; encoding: [0x10,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)
+// GFX11-ERR-NEXT: {{^}}                       ^
 // GFX9:     s_getreg_b32 s2, hwreg(HW_REG_TBA_LO)   ; encoding: [0x10,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) ; encoding: [0x11,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)
+// GFX11-ERR-NEXT: {{^}}                       ^
 // GFX9:     s_getreg_b32 s2, hwreg(HW_REG_TBA_HI)   ; encoding: [0x11,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) ; encoding: [0x12,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)
+// GFX11-ERR-NEXT: {{^}}                       ^
 // GFX9:     s_getreg_b32 s2, hwreg(HW_REG_TMA_LO)   ; encoding: [0x12,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) ; encoding: [0x13,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)
+// GFX11-ERR-NEXT: {{^}}                       ^
 // GFX9:     s_getreg_b32 s2, hwreg(HW_REG_TMA_HI)   ; encoding: [0x13,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO)
+// GFX9-ERR-NEXT: {{^}}                       ^
 // GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI)
+// GFX9-ERR-NEXT: {{^}}                       ^
 // GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x82,0xb8]
 
 s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) ; encoding: [0x16,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER)
 // GFX10:    s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER) ; encoding: [0x19,0xf8,0x02,0xb9]
-// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER)
+// SICIVI-ERR-NEXT: {{^}}                       ^
+// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_cmpk_le_u32 s2, -1
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, -1
+// GCN-NEXT: {{^}}                  ^
 
 s_cmpk_le_u32 s2, 0x1ffff
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x1ffff
+// GCN-NEXT: {{^}}                  ^
 
 s_cmpk_le_u32 s2, 0x10000
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x10000
+// GCN-NEXT: {{^}}                  ^
 
 s_mulk_i32 s2, 0xFFFFFFFFFFFF0000
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_mulk_i32 s2, 0xFFFFFFFFFFFF0000
+// GCN-NEXT: {{^}}               ^
 
 s_mulk_i32 s2, 0x10000
 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GCN-NEXT: {{^}}s_mulk_i32 s2, 0x10000
+// GCN-NEXT: {{^}}               ^
diff --git a/llvm/test/MC/AMDGPU/sopk.s b/llvm/test/MC/AMDGPU/sopk.s
index 2b20c35aa7719a..c912b83ca61c27 100644
--- a/llvm/test/MC/AMDGPU/sopk.s
+++ b/llvm/test/MC/AMDGPU/sopk.s
@@ -158,6 +158,12 @@ s_getreg_b32 s2, hwreg(51, 1, 31)
 // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
 // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
 
+s_getreg_b32 s2, {id: 51, offset: 1, size: 31}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
 // HW register code of unknown HW register, default offset/width
 s_getreg_b32 s2, hwreg(51)
 // SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
@@ -165,6 +171,27 @@ s_getreg_b32 s2, hwreg(51)
 // GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
 // GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
 
+// Structured form using default values.
+s_getreg_b32 s2, {id: 51}
+// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+
+// Fields may come in any order.
+s_getreg_b32 s2, {size: 32, id: 51}
+// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8]
+
+// Empty field lists are allowed.
+s_getreg_b32 s2, {}
+// SICI: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8]
+
 // HW register code of unknown HW register, valid symbolic name range but no name available
 s_getreg_b32 s2, hwreg(10)
 // SICI: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x02,0xb9]
@@ -271,17 +298,17 @@ s_setreg_b32 hwreg(HW_REG_HW_ID), s2
 // SICI: s_setreg_b32 hwreg(HW_REG_HW_ID), s2       ; encoding: [0x04,0xf8,0x82,0xb9]
 // VI9:  s_setreg_b32 hwreg(HW_REG_HW_ID), s2       ; encoding: [0x04,0xf8,0x02,0xb9]
 // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2   ; encoding: [0x17,0xf8,0x82,0xb9]
-// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 
 s_setreg_b32 hwreg(HW_REG_HW_ID1), s2
-// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2      ; encoding: [0x17,0xf8,0x82,0xb9]
 // GFX11: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x02,0xb9]
 
 s_setreg_b32 hwreg(HW_REG_HW_ID2), s2
-// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
-// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU
+// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
+// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU
 // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID2), s2      ; encoding: [0x18,0xf8,0x82,0xb9]
 // GFX11: s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 ; encoding: [0x18,0xf8,0x02,0xb9]
 
@@ -427,12 +454,24 @@ s_getreg_b32 s2, hwreg(reg + 1, offset - 1, width + 1)
 // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
 // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
 
+s_getreg_b32 s2, {id: reg + 1, offset: offset - 1, size: width + 1}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
 s_getreg_b32 s2, hwreg(1 + reg, -1 + offset, 1 + width)
 // SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
 // VI9:  s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
 // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
 // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
 
+s_getreg_b32 s2, {id: 1 + reg, offset: -1 + offset, size: 1 + width}
+// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// VI9:  s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9]
+// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8]
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/MC/ARM/thumbv8m.s b/llvm/test/MC/ARM/thumbv8m.s
index 0e9ab4a9b3bf91..f03dd03dae3a4f 100644
--- a/llvm/test/MC/ARM/thumbv8m.s
+++ b/llvm/test/MC/ARM/thumbv8m.s
@@ -184,13 +184,13 @@ ttat r0, r1
 // 'Lazy Load/Store Multiple'
 
 // UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlldm r5          @ encoding: [0x35,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlldm r5      @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a]
 vlldm r5
 
 // UNDEF-BASELINE: error: instruction requires: armv8m.main
-// CHECK-MAINLINE: vlstm r10         @ encoding: [0x2a,0xec,0x00,0x0a]
-// CHECK-MAINLINE_DSP: vlstm r10     @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
+// CHECK-MAINLINE_DSP: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a]
 vlstm r10
 
 // New SYSm's
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
new file mode 100644
index 00000000000000..4e35883ffe4332
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d31}
+// CHECK: vlstm	r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+vlldm r8, {d0 - d31}
+// CHECK: vlldm	r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8m.s b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
new file mode 100644
index 00000000000000..bbc95318aeb3d0
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-8m.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding %s \
+// RUN: | FileCheck --check-prefixes=CHECK %s
+
+vlstm r8, {d0 - d15}
+// CHECK: vlstm	r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8, {d0 - d15}
+// CHECK: vlldm	r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+vlstm r8
+// CHECK: vlstm	r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+vlldm r8
+// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
new file mode 100644
index 00000000000000..b57f535c6a25cf
--- /dev/null
+++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s
@@ -0,0 +1,61 @@
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERR %s
+
+// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \
+// RUN: | FileCheck --check-prefixes=ERRT2 %s
+
+vlstm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d11}
+
+vlldm r8, {d0 - d11}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d11}
+
+vlstm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d15}
+
+vlldm r8, {d3 - d15}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d15}
+
+vlstm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d0 - d29}
+
+vlldm r8, {d0 - d29}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d0 - d29}
+
+vlstm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlstm r8, {d3 - d31}
+
+vlldm r8, {d3 - d31}
+// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)
+// ERR-NEXT: vlldm r8, {d3 - d31}
+
+vlstm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlstm r8, {d0 - d35}
+
+vlldm r8, {d0 - d35}
+// ERR: error: register expected
+// ERR-NEXT: vlldm r8, {d0 - d35}
+
+vlstm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlstm pc
+
+vlldm pc
+// ERR: error: operand must be a register in range [r0, r14]
+// ERR-NEXT: vlldm pc
+
+vlstm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlstm pc
+
+vlldm pc
+// ERRT2: error: operand must be a register in range [r0, r14]
+// ERRT2-NEXT: vlldm pc
\ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
new file mode 100644
index 00000000000000..6b9882454c06a3
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
@@ -0,0 +1,11 @@
+// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x80,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a]
+
+[0x38,0xec,0x80,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a]
\ No newline at end of file
diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
new file mode 100644
index 00000000000000..1e28d5284c5b2a
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
\ No newline at end of file
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td
new file mode 100644
index 00000000000000..406e52d25be706
--- /dev/null
+++ b/llvm/test/TableGen/HwModeEncodeDecode3.td
@@ -0,0 +1,168 @@
+// RUN: llvm-tblgen -gen-emitter -I %p/../../include %s | \
+// RUN:     FileCheck %s --check-prefix=ENCODER
+// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | \
+// RUN:     FileCheck %s --check-prefix=DECODER
+// RUN: llvm-tblgen -gen-disassembler --suppress-per-hwmode-duplicates -I \
+// RUN:     %p/../../include %s | FileCheck %s --check-prefix=DECODER-SUPPRESS
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo { }
+
+def arch : Target {
+  let InstructionSet = archInstrInfo;
+}
+
+def Myi32 : Operand<i32> {
+  let DecoderMethod = "DecodeMyi32";
+}
+
+def HasA : Predicate<"Subtarget->hasA()">;
+def HasB : Predicate<"Subtarget->hasB()">;
+
+def ModeA : HwMode<"+a", [HasA]>;
+def ModeB : HwMode<"+b", [HasB]>;
+
+
+def fooTypeEncDefault : InstructionEncoding {
+  let Size = 8;
+  field bits<64> SoftFail = 0;
+  bits<64> Inst;
+  bits<8> factor;
+  let Inst{7...0} = factor;
+  let Inst{3...2} = 0b10;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncA : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{7...0} = factor;
+  let Inst{3...2} = 0b11;
+  let Inst{1...0} = 0b00;
+}
+
+def fooTypeEncB : InstructionEncoding {
+  let Size = 4;
+  field bits<32> SoftFail = 0;
+  bits<32> Inst;
+  bits<8> factor;
+  let Inst{15...8} = factor;
+  let Inst{1...0} = 0b11;
+}
+
+// Test for DefaultMode as a selector.
+def foo : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  let EncodingInfos = EncodingByHwMode<
+    [ModeA, ModeB, DefaultMode], [fooTypeEncA, fooTypeEncB, fooTypeEncDefault]
+  >;
+  let AsmString = "foo  $factor";
+}
+
+def bar: Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  let Size = 4;
+  bits<32> Inst;
+  bits<32> SoftFail;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{1...0} = 0b10;
+  let AsmString = "bar  $factor";
+}
+
+def baz : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$factor);
+  bits<32> Inst;
+  let EncodingInfos = EncodingByHwMode<
+    [ModeB], [fooTypeEncA]
+  >;
+  let AsmString = "foo  $factor";
+}
+
+def unrelated: Instruction {
+  let OutOperandList = (outs);
+  let DecoderNamespace = "Alt";
+  let InOperandList = (ins i32imm:$factor);
+  let Size = 4;
+  bits<32> Inst;
+  bits<32> SoftFail;
+  bits<8> factor;
+  let Inst{31...24} = factor;
+  let Inst{1...0} = 0b10;
+  let AsmString = "unrelated  $factor";
+}
+
+
+// DECODER-LABEL: DecoderTableAlt_DefaultMode32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeA32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTableAlt_ModeB32[] =
+// DECODER-DAG: Opcode: unrelated
+// DECODER-LABEL: DecoderTable_DefaultMode32[] =
+// DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_DefaultMode64[] =
+// DECODER-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-LABEL: DecoderTable_ModeA32[] =
+// DECODER-DAG: Opcode: fooTypeEncA:foo
+// DECODER-DAG: Opcode: bar
+// DECODER-LABEL: DecoderTable_ModeB32[] =
+// DECODER-DAG: Opcode: fooTypeEncB:foo
+// DECODER-DAG: Opcode: fooTypeEncA:baz
+// DECODER-DAG: Opcode: bar
+
+
+// DECODER-SUPPRESS-LABEL: DecoderTableAlt_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: unrelated
+// DECODER-SUPPRESS-LABEL: DecoderTable_AllModes32[] =
+// DECODER-SUPPRESS-DAG: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_DefaultMode64[] =
+// DECODER-SUPPRESS-NOT: Opcode: bar
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncDefault:foo
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeA32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:foo
+// DECODER-SUPPRESS-NOT: Opcode: bar
+// DECODER-SUPPRESS-LABEL: DecoderTable_ModeB32[] =
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncB:foo
+// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:baz
+// DECODER-SUPPRESS-NOT: Opcode: bar
+
+// ENCODER-LABEL:   static const uint64_t InstBits_DefaultMode[] = {
+// ENCODER:         UINT64_C(2),        // bar
+// ENCODER:         UINT64_C(0),        // baz
+// ENCODER:         UINT64_C(8),        // foo
+// ENCODER:         UINT64_C(2),        // unrelated
+
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeA[] = {
+// ENCODER:         UINT64_C(2),        // bar
+// ENCODER:         UINT64_C(0),        // baz
+// ENCODER:         UINT64_C(12),       // foo
+// ENCODER:         UINT64_C(2),        // unrelated
+
+// ENCODER-LABEL:   static const uint64_t InstBits_ModeB[] = {
+// ENCODER:         UINT64_C(2),        // bar
+// ENCODER:         UINT64_C(12),       // baz
+// ENCODER:         UINT64_C(3),        // foo
+// ENCODER:         UINT64_C(2),        // unrelated
+
+// ENCODER:  unsigned HwMode = STI.getHwMode();
+// ENCODER:  switch (HwMode) {
+// ENCODER:  default: llvm_unreachable("Unknown hardware mode!"); break;
+// ENCODER:  case 0: InstBits = InstBits_DefaultMode; break;
+// ENCODER:  case 1: InstBits = InstBits_ModeA; break;
+// ENCODER:  case 2: InstBits = InstBits_ModeB; break;
+// ENCODER:  };
+
+// ENCODER:     case ::foo: {
+// ENCODER:      switch (HwMode) {
+// ENCODER:      default: llvm_unreachable("Unhandled HwMode");
+// ENCODER:      case 0: {
+// ENCODER:      case 1: {
+// ENCODER:      case 2: {
+
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
index e3f2a54f321eda..279238bea1842e 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll
@@ -93,9 +93,8 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8
 ; CHECK:       loop.body:
 ; CHECK-NEXT:    [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
 ; CHECK-NEXT:    [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]]
-; CHECK-NEXT:    [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]]
 ; CHECK-NEXT:    [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]]
-; CHECK-NEXT:    [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]]
+; CHECK-NEXT:    [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 4
@@ -171,9 +170,8 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 %
 ; CHECK:       loop.body:
 ; CHECK-NEXT:    [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1
 ; CHECK-NEXT:    [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]]
-; CHECK-NEXT:    [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]]
 ; CHECK-NEXT:    [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]]
-; CHECK-NEXT:    [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]]
+; CHECK-NEXT:    [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]]
 ; CHECK-NEXT:    br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    store i8 0, ptr [[PTR_IV]], align 4
diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
index 66ce1ffc6ebc93..1842ca2d82545c 100644
--- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
+++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll
@@ -27,18 +27,16 @@ define void @test1(ptr %src, ptr noundef %lower, ptr noundef %upper, i8 %N) {
 ; CHECK-NEXT:    store i32 0, ptr [[PTR_SRC_IV]], align 4
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
 ; CHECK-NEXT:    [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT:    [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT:    [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT:    [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
 ; CHECK-NEXT:    br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
 ; CHECK:       loop.body.2:
 ; CHECK-NEXT:    [[PTR_SRC_IV_1:%.*]] = bitcast ptr [[SRC_IV_1]] to ptr
 ; CHECK-NEXT:    store i32 0, ptr [[PTR_SRC_IV_1]], align 4
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
 ; CHECK-NEXT:    [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT:    [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT:    [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT:    [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
 ; CHECK-NEXT:    br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[PTR_SRC_IV_2:%.*]] = bitcast ptr [[SRC_IV_2]] to ptr
@@ -125,16 +123,14 @@ define void @test2(ptr %src, ptr %lower, ptr %upper, i8 %N) {
 ; CHECK:       loop.body.1:
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
 ; CHECK-NEXT:    [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT:    [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT:    [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT:    [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
 ; CHECK-NEXT:    br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
 ; CHECK:       loop.body.2:
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
 ; CHECK-NEXT:    [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT:    [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT:    [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT:    [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
 ; CHECK-NEXT:    br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
@@ -221,16 +217,14 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) {
 ; CHECK:       loop.body.1:
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1
 ; CHECK-NEXT:    [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]]
-; CHECK-NEXT:    [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT:    [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT:    [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
 ; CHECK-NEXT:    br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
 ; CHECK:       loop.body.2:
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
 ; CHECK-NEXT:    [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT:    [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT:    [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT:    [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
 ; CHECK-NEXT:    br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
@@ -316,16 +310,14 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) {
 ; CHECK-NEXT:    br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]]
 ; CHECK:       loop.body.1:
 ; CHECK-NEXT:    [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]]
-; CHECK-NEXT:    [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT:    [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT:    [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
 ; CHECK-NEXT:    br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
 ; CHECK:       loop.body.2:
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
 ; CHECK-NEXT:    [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT:    [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
 ; CHECK-NEXT:    [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT:    [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT:    [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
 ; CHECK-NEXT:    br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
 ; CHECK:       loop.latch:
 ; CHECK-NEXT:    [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
diff --git a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
index 63f5d4d4ba347b..7844651a01f9a3 100644
--- a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
+++ b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
@@ -90,11 +90,9 @@ define i1 @gep_zext_idx_adds(ptr %p, i8 %cnt, i8 %off) {
 ; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[CNT]] to i16
 ; CHECK-NEXT:    [[EXT_1:%.*]] = add nuw nsw i16 [[EXT]], 1
 ; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i16 [[EXT_1]]
-; CHECK-NEXT:    [[T_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[P]]
-; CHECK-NEXT:    [[F_1:%.*]] = icmp ult ptr [[ADD_PTR]], [[P]]
 ; CHECK-NEXT:    [[GEP_11:%.*]] = getelementptr inbounds i32, ptr [[P]], i16 11
 ; CHECK-NEXT:    [[C_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[GEP_11]]
-; CHECK-NEXT:    [[RES_1:%.*]] = xor i1 [[T_1]], [[F_1]]
+; CHECK-NEXT:    [[RES_1:%.*]] = xor i1 true, false
 ; CHECK-NEXT:    [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_1]]
 ; CHECK-NEXT:    ret i1 [[RES_2]]
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
index 7121c85dd59be4..c12b3b122ba74c 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
-; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 2>&1 | FileCheck %s
 
 ; CHECK-LABEL: vector.body
 ; CHECK: load double, ptr
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
index 1bcd7a2e009e0b..da6dc34e409684 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -120,7 +120,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
 entry:
@@ -260,7 +260,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
 ; CHECK-NEXT:  LV: Found a vectorizable loop (vscale x 4) in <stdin>
 ; CHECK-NEXT:  LEV: Epilogue vectorization is not profitable for this loop
 ; CHECK-NEXT:  Executing best plan with VF=vscale x 4, UF=1
-; CHECK-NEXT:  LV: Interleaving disabled by the pass manager
+; CHECK:       LV: Interleaving disabled by the pass manager
 ; CHECK-NEXT:  LV: Vectorizing: innermost loop.
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
new file mode 100644
index 00000000000000..9968c933494aa0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+; FIXME: The vectorizer should refuse to fold the tail by masking because
+; %conv is used outside of the loop. Test for this by checking that
+; %n.vec, the vector trip count, is rounded down to the next multiple of
+; 4. If folding the tail, it would have been rounded up instead.
+; Test case for #76069(https://github.com/llvm/llvm-project/issues/76069).
+define i32 @test(ptr %arr, i64 %n) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ugt i64 [[N]], 1
+; CHECK-NEXT:    br i1 [[CMP1]], label [[PREHEADER:%.*]], label [[DONE:%.*]]
+; CHECK:       preheader:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[N]], -2
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT:    [[TMP3:%.*]] = add i8 1, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ugt i64 [[TMP1]], 255
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP1]] to i8
+; CHECK-NEXT:    [[TMP8:%.*]] = add i8 2, [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i8 [[TMP8]], 2
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ugt i64 [[TMP1]], 255
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP6]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = add i64 1, [[N_VEC]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8
+; CHECK-NEXT:    [[IND_END1:%.*]] = add i8 1, [[DOTCAST]]
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 1, i64 2, i64 3, i64 4>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE10]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <4 x i64> [[VEC_IND]], <i64 -1, i64 -1, i64 -1, i64 -1>
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i1> [[TMP17]], i32 0
+; CHECK-NEXT:    br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP20]]
+; CHECK-NEXT:    store i32 65, ptr [[TMP21]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i1> [[TMP17]], i32 1
+; CHECK-NEXT:    br i1 [[TMP22]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; CHECK:       pred.store.if5:
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]]
+; CHECK-NEXT:    store i32 65, ptr [[TMP24]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
+; CHECK:       pred.store.continue6:
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i1> [[TMP17]], i32 2
+; CHECK-NEXT:    br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK:       pred.store.if7:
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP26]]
+; CHECK-NEXT:    store i32 65, ptr [[TMP27]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE8]]
+; CHECK:       pred.store.continue8:
+; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP17]], i32 3
+; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.if9:
+; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
+; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP29]]
+; CHECK-NEXT:    store i32 65, ptr [[TMP30]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE10]]
+; CHECK:       pred.store.continue10:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMO:%.*]] = sub i64 [[N_VEC]], 1
+; CHECK-NEXT:    [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]]
+; CHECK-NEXT:    br i1 true, label [[LOAD_VAL:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i8 [ [[INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i64 [[CONV]], -1
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[SUB]]
+; CHECK-NEXT:    store i32 65, ptr [[PTR]], align 4
+; CHECK-NEXT:    [[INC]] = add i8 [[I]], 1
+; CHECK-NEXT:    [[CONV2]] = zext i8 [[INC]] to i64
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i64 [[CONV2]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[LOOP]], label [[LOAD_VAL]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       load_val:
+; CHECK-NEXT:    [[FINAL:%.*]] = phi i64 [ [[CONV]], [[LOOP]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[PTR2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[FINAL]]
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4
+; CHECK-NEXT:    br label [[DONE]]
+; CHECK:       done:
+; CHECK-NEXT:    [[VALUE:%.*]] = phi i32 [ [[VAL]], [[LOAD_VAL]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[VALUE]]
+;
+entry:
+  %cmp1 = icmp ugt i64 %n, 1
+  br i1 %cmp1, label %preheader, label %done
+
+preheader:
+  br label %loop
+
+loop:
+  %conv = phi i64 [ %conv2, %loop ], [ 1, %preheader ]
+  %i = phi i8 [ %inc, %loop ], [ 1, %preheader ]
+  %sub = add nsw i64 %conv, -1
+  %ptr = getelementptr inbounds i32, ptr %arr, i64 %sub
+  store i32 65, ptr %ptr, align 4
+  %inc = add i8 %i, 1
+  %conv2 = zext i8 %inc to i64
+  %cmp2 = icmp ult i64 %conv2, %n
+  br i1 %cmp2, label %loop, label %load_val, !llvm.loop !0
+
+load_val:
+  %final = phi i64 [ %conv, %loop ]
+  %ptr2 = getelementptr inbounds i32, ptr %arr, i64 %final
+  %val = load i32, ptr %ptr2, align 4
+  br label %done
+
+done:
+  %value = phi i32 [ %val, %load_val ], [ 0, %entry ]
+  ret i32 %value
+
+}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.unroll.disable"}
+!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
+; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
new file mode 100644
index 00000000000000..1dddbfe20a2ed7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll
@@ -0,0 +1,90 @@
+; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -disable-output -debug -S %s 2>&1 | FileCheck --check-prefixes=CHECK %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; REQUIRES: asserts
+
+; Check if the vector loop condition can be simplified to true for a given
+; VF/IC combination.
+define void @test_tc_less_than_16(ptr %A, i64 %N) {
+; CHECK:      LV: Scalarizing:  %cmp =
+; CHECK-NEXT: VPlan 'Initial VPlan for VF={8},UF>=1' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count
+; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT:     EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1
+; CHECK-NEXT:     vp<[[VPTR:%.]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:     vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT:     WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+; CHECK: Executing best plan with VF=8, UF=2
+; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' {
+; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
+; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
+; CHECK-EMPTY:
+; CHECK-NEXT: ph:
+; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64)
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: Successor(s): vector loop
+; CHECK-EMPTY:
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT:   vector.body:
+; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
+; CHECK-NEXT:     EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1
+; CHECK-NEXT:     vp<[[VPTR:%.]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT:     WIDEN ir<%l> = load vp<[[VPTR]]>
+; CHECK-NEXT:     WIDEN ir<%add> = add nsw ir<%l>, ir<10>
+; CHECK-NEXT:     vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src>
+; CHECK-NEXT:     WIDEN store vp<[[VPTR2]]>, ir<%add>
+; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]>
+; CHECK-NEXT:     EMIT branch-on-cond ir<true>
+; CHECK-NEXT:   No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+;
+entry:
+  %and = and i64 %N, 15
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ]
+  %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ]
+  %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1
+  %l = load i8, ptr %p.src, align 1
+  %add = add nsw i8 %l, 10
+  store i8 %add, ptr %p.src
+  %iv.next = add nsw i64 %iv, -1
+  %cmp = icmp eq i64 %iv.next, 0
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/SROA/phi-and-select.ll b/llvm/test/Transforms/SROA/phi-and-select.ll
index 54cfb10793a1ac..7c8b27c9de9c0b 100644
--- a/llvm/test/Transforms/SROA/phi-and-select.ll
+++ b/llvm/test/Transforms/SROA/phi-and-select.ll
@@ -114,13 +114,13 @@ define i32 @test3(i32 %x) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    switch i32 [[X:%.*]], label [[BB0:%.*]] [
-; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
-; CHECK-NEXT:    i32 2, label [[BB2:%.*]]
-; CHECK-NEXT:    i32 3, label [[BB3:%.*]]
-; CHECK-NEXT:    i32 4, label [[BB4:%.*]]
-; CHECK-NEXT:    i32 5, label [[BB5:%.*]]
-; CHECK-NEXT:    i32 6, label [[BB6:%.*]]
-; CHECK-NEXT:    i32 7, label [[BB7:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:      i32 2, label [[BB2:%.*]]
+; CHECK-NEXT:      i32 3, label [[BB3:%.*]]
+; CHECK-NEXT:      i32 4, label [[BB4:%.*]]
+; CHECK-NEXT:      i32 5, label [[BB5:%.*]]
+; CHECK-NEXT:      i32 6, label [[BB6:%.*]]
+; CHECK-NEXT:      i32 7, label [[BB7:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb0:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -733,6 +733,7 @@ define void @PR20822(i1 %c1, i1 %c2, ptr %ptr) {
 ; CHECK-LABEL: @PR20822(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[F_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[F1_SROA_GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[PTR:%.*]], i32 0, i32 0
 ; CHECK-NEXT:    br i1 [[C1:%.*]], label [[IF_END:%.*]], label [[FOR_COND:%.*]]
 ; CHECK:       for.cond:
 ; CHECK-NEXT:    br label [[IF_END]]
@@ -742,9 +743,8 @@ define void @PR20822(i1 %c1, i1 %c2, ptr %ptr) {
 ; CHECK:       if.then2:
 ; CHECK-NEXT:    br label [[IF_THEN5]]
 ; CHECK:       if.then5:
-; CHECK-NEXT:    [[F1:%.*]] = phi ptr [ [[PTR:%.*]], [[IF_THEN2]] ], [ [[F_SROA_0]], [[IF_END]] ]
-; CHECK-NEXT:    [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[F1]], i32 0, i32 0
-; CHECK-NEXT:    store i32 0, ptr [[DOTFCA_0_GEP]], align 4
+; CHECK-NEXT:    [[F1_SROA_PHI:%.*]] = phi ptr [ [[F1_SROA_GEP]], [[IF_THEN2]] ], [ [[F_SROA_0]], [[IF_END]] ]
+; CHECK-NEXT:    store i32 0, ptr [[F1_SROA_PHI]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SROA/phi-gep.ll b/llvm/test/Transforms/SROA/phi-gep.ll
index c5aa1cdd9cf654..78071dcdafb49d 100644
--- a/llvm/test/Transforms/SROA/phi-gep.ll
+++ b/llvm/test/Transforms/SROA/phi-gep.ll
@@ -65,15 +65,13 @@ end:
 define i32 @test_sroa_phi_gep_poison(i1 %cond) {
 ; CHECK-LABEL: @test_sroa_phi_gep_poison(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
 ; CHECK:       if.then:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN:%.*]] = load i32, ptr poison, align 4
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ poison, [[IF_THEN]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN]], [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
 ;
 entry:
   %a = alloca %pair, align 4
@@ -94,17 +92,13 @@ end:
 define i32 @test_sroa_phi_gep_global(i1 %cond) {
 ; CHECK-LABEL: @test_sroa_phi_gep_global(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [[PAIR:%.*]], align 4
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds [[PAIR]], ptr [[A]], i32 0, i32 1
-; CHECK-NEXT:    store i32 1, ptr [[GEP_A]], align 4
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[END:%.*]]
 ; CHECK:       if.then:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN:%.*]] = load i32, ptr getelementptr inbounds ([[PAIR:%.*]], ptr @g, i32 0, i32 1), align 4
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ @g, [[IF_THEN]] ]
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
-; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
-; CHECK-NEXT:    ret i32 [[LOAD]]
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[PHI_SROA_PHI_SROA_SPECULATE_LOAD_IF_THEN]], [[IF_THEN]] ]
+; CHECK-NEXT:    ret i32 [[PHI_SROA_PHI_SROA_SPECULATED]]
 ;
 entry:
   %a = alloca %pair, align 4
@@ -245,7 +239,7 @@ define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality ptr @__gxx_personalit
 ; CHECK-NEXT:    br i1 [[COND:%.*]], label [[CALL:%.*]], label [[END:%.*]]
 ; CHECK:       call:
 ; CHECK-NEXT:    [[B:%.*]] = invoke ptr @foo()
-; CHECK-NEXT:    to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
+; CHECK-NEXT:            to label [[END]] unwind label [[INVOKE_CATCH:%.*]]
 ; CHECK:       end:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[B]], [[CALL]] ]
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds [[PAIR]], ptr [[PHI]], i32 0, i32 1
@@ -253,7 +247,7 @@ define i32 @test_sroa_invoke_phi_gep(i1 %cond) personality ptr @__gxx_personalit
 ; CHECK-NEXT:    ret i32 [[LOAD]]
 ; CHECK:       invoke_catch:
 ; CHECK-NEXT:    [[RES:%.*]] = landingpad { ptr, i32 }
-; CHECK-NEXT:    catch ptr null
+; CHECK-NEXT:            catch ptr null
 ; CHECK-NEXT:    ret i32 0
 ;
 entry:
@@ -468,10 +462,10 @@ define i32 @test_sroa_phi_gep_multiple_values_from_same_block(i32 %arg) {
 ; CHECK-LABEL: @test_sroa_phi_gep_multiple_values_from_same_block(
 ; CHECK-NEXT:  bb.1:
 ; CHECK-NEXT:    switch i32 [[ARG:%.*]], label [[BB_3:%.*]] [
-; CHECK-NEXT:    i32 1, label [[BB_2:%.*]]
-; CHECK-NEXT:    i32 2, label [[BB_2]]
-; CHECK-NEXT:    i32 3, label [[BB_4:%.*]]
-; CHECK-NEXT:    i32 4, label [[BB_4]]
+; CHECK-NEXT:      i32 1, label [[BB_2:%.*]]
+; CHECK-NEXT:      i32 2, label [[BB_2]]
+; CHECK-NEXT:      i32 3, label [[BB_4:%.*]]
+; CHECK-NEXT:      i32 4, label [[BB_4]]
 ; CHECK-NEXT:    ]
 ; CHECK:       bb.2:
 ; CHECK-NEXT:    br label [[BB_4]]
@@ -504,6 +498,117 @@ bb.4:                                                ; preds = %bb.1, %bb.1, %bb
   ret i32 %load
 }
 
+define i64 @test_phi_idx_mem2reg_const(i1 %arg) {
+; CHECK-LABEL: @test_phi_idx_mem2reg_const(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_SROA_PHI_SROA_SPECULATED:%.*]] = phi i64 [ 2, [[BB1]] ], [ 3, [[BB2]] ]
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ 1, [[BB2]] ]
+; CHECK-NEXT:    ret i64 [[PHI_SROA_PHI_SROA_SPECULATED]]
+;
+bb:
+  %alloca = alloca [2 x i64], align 8
+  %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+  store i64 2, ptr %alloca
+  store i64 3, ptr %gep1
+  br i1 %arg, label %bb1, label %bb2
+
+bb1:
+  br label %end
+
+bb2:
+  br label %end
+
+end:
+  %phi = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
+  %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %phi
+  %load = load i64, ptr %getelementptr
+  ret i64 %load
+}
+
+define i64 @test_phi_idx_mem2reg_not_const(i1 %arg, i64 %idx) {
+; CHECK-LABEL: @test_phi_idx_mem2reg_not_const(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x i64], align 8
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 1
+; CHECK-NEXT:    store i64 2, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    store i64 3, ptr [[GEP1]], align 4
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[IDX:%.*]], [[BB2]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT:    ret i64 [[LOAD]]
+;
+bb:
+  %alloca = alloca [2 x i64], align 8
+  %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+  store i64 2, ptr %alloca
+  store i64 3, ptr %gep1
+  br i1 %arg, label %bb1, label %bb2
+
+bb1:
+  br label %end
+
+bb2:
+  br label %end
+
+end:
+  %phi = phi i64 [ 0, %bb1 ], [ %idx, %bb2 ]
+  %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %phi
+  %load = load i64, ptr %getelementptr
+  ret i64 %load
+}
+
+define i64 @test_phi_mem2reg_pointer_op_is_non_const_gep(i1 %arg, i64 %idx) {
+; CHECK-LABEL: @test_phi_mem2reg_pointer_op_is_non_const_gep(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [2 x i64], align 8
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 1
+; CHECK-NEXT:    store i64 2, ptr [[ALLOCA]], align 4
+; CHECK-NEXT:    store i64 3, ptr [[GEP1]], align 4
+; CHECK-NEXT:    br i1 [[ARG:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 0, [[BB1]] ], [ 1, [[BB2]] ]
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr inbounds i64, ptr [[ALLOCA]], i64 [[IDX:%.*]]
+; CHECK-NEXT:    [[GETELEMENTPTR2:%.*]] = getelementptr inbounds i64, ptr [[GETELEMENTPTR]], i64 [[PHI]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GETELEMENTPTR]], align 4
+; CHECK-NEXT:    ret i64 [[LOAD]]
+;
+bb:
+  %alloca = alloca [2 x i64], align 8
+  %gep1 = getelementptr inbounds i64, ptr %alloca, i64 1
+  store i64 2, ptr %alloca
+  store i64 3, ptr %gep1
+  br i1 %arg, label %bb1, label %bb2
+
+bb1:
+  br label %end
+
+bb2:
+  br label %end
+
+end:
+  %phi = phi i64 [ 0, %bb1 ], [ 1, %bb2 ]
+  %getelementptr = getelementptr inbounds i64, ptr %alloca, i64 %idx
+  %getelementptr2 = getelementptr inbounds i64, ptr %getelementptr, i64 %phi
+  %load = load i64, ptr %getelementptr
+  ret i64 %load
+}
+
 declare ptr @foo()
 
 declare i32 @__gxx_personality_v0(...)
diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
new file mode 100644
index 00000000000000..81ca75251381ad
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
@@ -0,0 +1,12 @@
+# REQUIRES: exegesis-can-measure-latency, x86_64-linux
+
+# Test that specifying the loop register to use works as expected.
+
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s | FileCheck %s
+
+# CHECK: measurements:
+
+# LLVM-EXEGESIS-DEFREG R11 ff
+# LLVM-EXEGESIS-LOOP-REGISTER R12
+
+addq $0xff, %r11
diff --git a/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
new file mode 100644
index 00000000000000..de30ee09bfda53
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
@@ -0,0 +1,311 @@
+# RUN: yaml2obj --docnum=1 %s -o %t.o
+# RUN: echo '.*' > %t.symbols.regex
+
+## Check that the visibility of all symbols is properly set to DEFAULT.
+# RUN: llvm-objcopy %t.o %t0.o --set-symbols-visibility=%t.symbols.regex=default --regex
+# RUN: llvm-readelf -s %t0.o | FileCheck %s --check-prefix=DEF
+
+# DEF-DAG: DEFAULT     1 default_local
+# DEF-DAG: DEFAULT     1 internal_local
+# DEF-DAG: DEFAULT     1 hidden_local
+# DEF-DAG: DEFAULT     1 protected_local
+# DEF-DAG: DEFAULT     1 default_global
+# DEF-DAG: DEFAULT     1 default_weak
+# DEF-DAG: DEFAULT     1 internal_global
+# DEF-DAG: DEFAULT     1 internal_weak
+# DEF-DAG: DEFAULT     1 hidden_global
+# DEF-DAG: DEFAULT     1 hidden_weak
+# DEF-DAG: DEFAULT     1 protected_global
+# DEF-DAG: DEFAULT     1 protected_weak
+
+## Check that the visibility of all symbols is properly set to HIDDEN.
+# RUN: llvm-objcopy %t.o %t1.o --set-symbols-visibility=%t.symbols.regex=hidden --regex
+# RUN: llvm-readelf -s %t1.o | FileCheck %s --check-prefix=HID
+
+# HID-DAG: HIDDEN      1 default_local
+# HID-DAG: HIDDEN      1 internal_local
+# HID-DAG: HIDDEN      1 hidden_local
+# HID-DAG: HIDDEN      1 protected_local
+# HID-DAG: HIDDEN      1 default_global
+# HID-DAG: HIDDEN      1 default_weak
+# HID-DAG: HIDDEN      1 internal_global
+# HID-DAG: HIDDEN      1 internal_weak
+# HID-DAG: HIDDEN      1 hidden_global
+# HID-DAG: HIDDEN      1 hidden_weak
+# HID-DAG: HIDDEN      1 protected_global
+# HID-DAG: HIDDEN      1 protected_weak
+
+## Check that the visibility of all symbols is properly set to PROTECTED.
+# RUN: llvm-objcopy %t.o %t2.o --set-symbols-visibility=%t.symbols.regex=protected --regex
+# RUN: llvm-readelf -s %t2.o | FileCheck %s --check-prefix=PRO
+
+# PRO-DAG: PROTECTED   1 default_local
+# PRO-DAG: PROTECTED   1 internal_local
+# PRO-DAG: PROTECTED   1 hidden_local
+# PRO-DAG: PROTECTED   1 protected_local
+# PRO-DAG: PROTECTED   1 default_global
+# PRO-DAG: PROTECTED   1 default_weak
+# PRO-DAG: PROTECTED   1 internal_global
+# PRO-DAG: PROTECTED   1 internal_weak
+# PRO-DAG: PROTECTED   1 hidden_global
+# PRO-DAG: PROTECTED   1 hidden_weak
+# PRO-DAG: PROTECTED   1 protected_global
+# PRO-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of all symbols is properly set to INTERNAL.
+# RUN: llvm-objcopy %t.o %t3.o --set-symbols-visibility=%t.symbols.regex=internal --regex
+# RUN: llvm-readelf -s %t3.o | FileCheck %s --check-prefix=INT
+
+# INT-DAG: INTERNAL    1 default_local
+# INT-DAG: INTERNAL    1 internal_local
+# INT-DAG: INTERNAL    1 hidden_local
+# INT-DAG: INTERNAL    1 protected_local
+# INT-DAG: INTERNAL    1 default_global
+# INT-DAG: INTERNAL    1 default_weak
+# INT-DAG: INTERNAL    1 internal_global
+# INT-DAG: INTERNAL    1 internal_weak
+# INT-DAG: INTERNAL    1 hidden_global
+# INT-DAG: INTERNAL    1 hidden_weak
+# INT-DAG: INTERNAL    1 protected_global
+# INT-DAG: INTERNAL    1 protected_weak
+
+## Check that setting the visibility of certain symbols that were read from
+## a file does not affect other symbols.
+# RUN: echo -e "default_local\ninternal_local" > %t.symbol.list
+# RUN: llvm-objcopy %t.o %t4.o --set-symbols-visibility=%t.symbol.list=hidden
+# RUN: llvm-readelf -s %t4.o | FileCheck %s --check-prefix=FILE
+
+# FILE-DAG: HIDDEN      1 default_local
+# FILE-DAG: HIDDEN      1 internal_local
+## Unaffected symbols:
+# FILE-DAG: HIDDEN      1 hidden_local
+# FILE-DAG: PROTECTED   1 protected_local
+# FILE-DAG: DEFAULT     1 default_global
+# FILE-DAG: DEFAULT     1 default_weak
+# FILE-DAG: INTERNAL    1 internal_global
+# FILE-DAG: INTERNAL    1 internal_weak
+# FILE-DAG: HIDDEN      1 hidden_global
+# FILE-DAG: HIDDEN      1 hidden_weak
+# FILE-DAG: PROTECTED   1 protected_global
+# FILE-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of a single symbol is set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t5.o --set-symbol-visibility=default_local=hidden \
+# RUN:                         --set-symbol-visibility=internal_local=protected \
+# RUN:                         --set-symbol-visibility=hidden_local=internal \
+# RUN:                         --set-symbol-visibility=protected_local=default
+# RUN: llvm-readelf -s %t5.o | FileCheck %s --check-prefix=SINGLE
+
+# SINGLE-DAG: HIDDEN      1 default_local
+# SINGLE-DAG: PROTECTED   1 internal_local
+# SINGLE-DAG: INTERNAL    1 hidden_local
+# SINGLE-DAG: DEFAULT     1 protected_local
+## Unaffected symbols:
+# SINGLE-DAG: DEFAULT     1 default_global
+# SINGLE-DAG: DEFAULT     1 default_weak
+# SINGLE-DAG: INTERNAL    1 internal_global
+# SINGLE-DAG: INTERNAL    1 internal_weak
+# SINGLE-DAG: HIDDEN      1 hidden_global
+# SINGLE-DAG: HIDDEN      1 hidden_weak
+# SINGLE-DAG: PROTECTED   1 protected_global
+# SINGLE-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of symbols specified by a regex are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t6.o --set-symbol-visibility='.*'_local=hidden --regex
+# RUN: llvm-readelf -s %t6.o | FileCheck %s --check-prefix=REGEX
+
+# REGEX-DAG: HIDDEN      1 default_local
+# REGEX-DAG: HIDDEN      1 internal_local
+# REGEX-DAG: HIDDEN      1 hidden_local
+# REGEX-DAG: HIDDEN      1 protected_local
+## Unaffected symbols:
+# REGEX-DAG: DEFAULT     1 default_global
+# REGEX-DAG: DEFAULT     1 default_weak
+# REGEX-DAG: INTERNAL    1 internal_global
+# REGEX-DAG: INTERNAL    1 internal_weak
+# REGEX-DAG: HIDDEN      1 hidden_global
+# REGEX-DAG: HIDDEN      1 hidden_weak
+# REGEX-DAG: PROTECTED   1 protected_global
+# REGEX-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of symbols specified by a wildcard are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t7.o --set-symbol-visibility='*_local'=hidden --wildcard
+# RUN: llvm-readelf -s %t7.o | FileCheck %s --check-prefix=WILDCARD
+
+# WILDCARD-DAG: HIDDEN      1 default_local
+# WILDCARD-DAG: HIDDEN      1 internal_local
+# WILDCARD-DAG: HIDDEN      1 hidden_local
+# WILDCARD-DAG: HIDDEN      1 protected_local
+## Unaffected symbols:
+# WILDCARD-DAG: DEFAULT     1 default_global
+# WILDCARD-DAG: DEFAULT     1 default_weak
+# WILDCARD-DAG: INTERNAL    1 internal_global
+# WILDCARD-DAG: INTERNAL    1 internal_weak
+# WILDCARD-DAG: HIDDEN      1 hidden_global
+# WILDCARD-DAG: HIDDEN      1 hidden_weak
+# WILDCARD-DAG: PROTECTED   1 protected_global
+# WILDCARD-DAG: PROTECTED   1 protected_weak
+
+## Check that the latest option that matches the same symbols as any of the previous
+## options overwrites the visibility of these symbols.
+# RUN: echo -e '*_weak\n*_local' > %t.symbols.pattern
+# RUN: llvm-objcopy %t.o %t8.o --set-symbol-visibility='default_*'=hidden \
+# RUN:                         --set-symbol-visibility='internal_*'=hidden \
+# RUN:                         --set-symbols-visibility=%t.symbols.pattern=protected \
+# RUN:                         --wildcard
+# RUN: llvm-readelf -s %t8.o | FileCheck %s --check-prefix=REWRITE
+
+# REWRITE-DAG: PROTECTED   1 default_local
+# REWRITE-DAG: HIDDEN      1 default_global
+# REWRITE-DAG: PROTECTED   1 default_weak
+# REWRITE-DAG: PROTECTED   1 internal_local
+# REWRITE-DAG: HIDDEN      1 internal_global
+# REWRITE-DAG: PROTECTED   1 internal_weak
+# REWRITE-DAG: PROTECTED   1 hidden_local
+# REWRITE-DAG: PROTECTED   1 hidden_weak
+# REWRITE-DAG: PROTECTED   1 protected_local
+# REWRITE-DAG: PROTECTED   1 protected_weak
+## Unaffected symbols:
+# REWRITE-DAG: HIDDEN      1 hidden_global
+# REWRITE-DAG: PROTECTED   1 protected_global
+
+## Check that a symbol name with a special charater is treated as a plain name
+## when pattern matching options are not enabled.
+# RUN: yaml2obj --docnum=2 %s -o %t9.o
+# RUN: llvm-objcopy %t9.o --set-symbol-visibility='f*o'=hidden
+# RUN: llvm-readelf -s %t9.o | FileCheck %s --check-prefix=SPECIAL
+
+# SPECIAL-DAG: HIDDEN      1 f*o
+## Unaffected symbol:
+# SPECIAL-DAG: DEFAULT     1 foo
+
+# RUN: yaml2obj --docnum=3 %s -o %t10.o
+
+## Check that the visibility of undefined symbols can be changed as well.
+# RUN: llvm-objcopy %t10.o --set-symbol-visibility=foo=hidden
+# RUN: llvm-readelf -s %t10.o | FileCheck %s --check-prefix=UNDEF
+# UNDEF: HIDDEN    UND foo
+
+## Check that passing an invalid visibility type generates an error message.
+# RUN: echo 'foo' > %t.symbols
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols=invalid-type 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=TYPE
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo=invalid-type 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=TYPE
+# TYPE: error: 'invalid-type' is not a valid symbol visibility
+
+## Check that omitting the '=' character generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols,hidden 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbols-visibility
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo default 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbol-visibility
+# FORMAT: error: bad format for [[OPTION]]
+
+## Check that using an invalid symbol pattern generates an error.
+# RUN: echo '*.' > %t.symbols.regex
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols.regex=hidden --regex 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=SYMBOL
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility='*.'=default --regex 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=SYMBOL
+# SYMBOL: error: cannot compile regular expression '*.': repetition-operator operand invalid
+
+## Check passing an invalid filename generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=no_file=hidden 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=NO_FILE -DMSG=%errc_ENOENT
+# NO_FILE: error: 'no_file': [[MSG]]
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+Symbols:
+  - Name:    default_local
+    Section: .text
+    Binding:  STB_LOCAL
+  - Name:    protected_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_PROTECTED ]
+  - Name:    internal_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_HIDDEN ]
+  - Name:    default_weak
+    Section: .text
+    Binding:  STB_WEAK
+  - Name:    internal_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_HIDDEN ]
+  - Name:    protected_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_PROTECTED ]
+  - Name:    default_global
+    Section: .text
+    Binding:  STB_GLOBAL
+  - Name:    internal_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_HIDDEN ]
+  - Name:    protected_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_PROTECTED ]
+  - Name:    ignored_name
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_INTERNAL ]
+...
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+Symbols:
+  - Name:    f*o
+    Section: .text
+    Binding:  STB_LOCAL
+  - Name:    foo
+    Section: .text
+    Binding:  STB_LOCAL
+...
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Symbols:
+  - Name:     foo
+    Binding:  STB_LOCAL
+...
diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 2b438a8b134613..2bfc9705368e46 100644
--- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -286,6 +286,8 @@ static opt<bool> Verify("verify", desc("Verify the DWARF debug info."),
                         cat(DwarfDumpCategory));
 static opt<ErrorDetailLevel> ErrorDetails(
     "error-display", init(Unspecified),
+    desc("Set the level of detail and summary to display when verifying "
+         "(implies --verify)"),
     values(clEnumValN(NoDetailsOrSummary, "quiet",
                       "Only display whether errors occurred."),
            clEnumValN(NoDetailsOnlySummary, "summary",
@@ -295,6 +297,11 @@ static opt<ErrorDetailLevel> ErrorDetails(
            clEnumValN(BothDetailsAndSummary, "full",
                       "Display each error as well as a summary. [default]")),
     cat(DwarfDumpCategory));
+static opt<std::string> JsonErrSummaryFile(
+    "verify-json", init(""),
+    desc("Output JSON-formatted error summary to the specified file. "
+         "(Implies --verify)"),
+    value_desc("filename.json"), cat(DwarfDumpCategory));
 static opt<bool> Quiet("quiet", desc("Use with -verify to not emit to STDOUT."),
                        cat(DwarfDumpCategory));
 static opt<bool> DumpUUID("uuid", desc("Show the UUID for each architecture."),
@@ -349,6 +356,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) {
                        ErrorDetails != NoDetailsOrSummary;
     DumpOpts.ShowAggregateErrors = ErrorDetails != OnlyDetailsNoSummary &&
                                    ErrorDetails != NoDetailsOnlySummary;
+    DumpOpts.JsonErrSummaryFile = JsonErrSummaryFile;
     return DumpOpts.noImplicitRecursion();
   }
   return DumpOpts;
@@ -834,8 +842,10 @@ int main(int argc, char **argv) {
                           "-verbose is currently not supported";
     return 1;
   }
-  if (!Verify && ErrorDetails != Unspecified)
-    WithColor::warning() << "-error-detail has no affect without -verify";
+  // -error-detail and -json-summary-file both imply -verify
+  if (ErrorDetails != Unspecified || !JsonErrSummaryFile.empty()) {
+    Verify = true;
+  }
 
   std::error_code EC;
   ToolOutputFile OutputFile(OutputFilename, EC, sys::fs::OF_TextWithCRLF);
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 0aecaaeea4b2e7..4ae6bc2a54cd50 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -74,6 +74,8 @@ struct BenchmarkKey {
   // The address that the snippet should be loaded in at if the execution mode
   // being used supports it.
   intptr_t SnippetAddress = 0;
+  // The register that should be used to hold the loop counter.
+  unsigned LoopRegister;
 };
 
 struct BenchmarkMeasure {
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
index 7258fcb4279c7d..431d99c72b8086 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
@@ -9,6 +9,7 @@
 #include "SnippetFile.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
+#include "Target.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCObjectFileInfo.h"
@@ -175,6 +176,20 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer {
 
       return;
     }
+    if (CommentText.consume_front("LOOP-REGISTER")) {
+      // LLVM-EXEGESIS-LOOP-REGISTER <loop register>
+      unsigned LoopRegister;
+
+      if (!(LoopRegister = findRegisterByName(CommentText.trim()))) {
+        errs() << "unknown register '" << CommentText
+               << "' in 'LLVM-EXEGESIS-LOOP-REGISTER " << CommentText << "'\n";
+        ++InvalidComments;
+        return;
+      }
+
+      Result->Key.LoopRegister = LoopRegister;
+      return;
+    }
   }
 
   unsigned numInvalidComments() const { return InvalidComments; }
@@ -221,6 +236,11 @@ Expected<std::vector<BenchmarkCode>> readSnippets(const LLVMState &State,
 
   BenchmarkCode Result;
 
+  // Ensure that there is a default loop register value specified.
+  Result.Key.LoopRegister =
+      State.getExegesisTarget().getDefaultLoopCounterRegister(
+          State.getTargetMachine().getTargetTriple());
+
   const TargetMachine &TM = State.getTargetMachine();
   MCContext Context(TM.getTargetTriple(), TM.getMCAsmInfo(),
                     TM.getMCRegisterInfo(), TM.getMCSubtargetInfo());
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
index 561687a62319b3..0bab30d1582003 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -48,10 +48,8 @@ class DuplicateSnippetRepetitor : public SnippetRepetitor {
 
 class LoopSnippetRepetitor : public SnippetRepetitor {
 public:
-  explicit LoopSnippetRepetitor(const LLVMState &State)
-      : SnippetRepetitor(State),
-        LoopCounter(State.getExegesisTarget().getLoopCounterRegister(
-            State.getTargetMachine().getTargetTriple())) {}
+  explicit LoopSnippetRepetitor(const LLVMState &State, unsigned LoopRegister)
+      : SnippetRepetitor(State), LoopCounter(LoopRegister) {}
 
   // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times.
   FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
@@ -113,8 +111,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor {
         (void)_;
         Loop.addInstructions(Instructions);
       }
-      ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
-                                     State.getInstrInfo());
+      ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo(),
+                                     LoopCounter);
 
       // Set up the exit basic block.
       Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero());
@@ -138,14 +136,14 @@ SnippetRepetitor::~SnippetRepetitor() {}
 
 std::unique_ptr<const SnippetRepetitor>
 SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode,
-                         const LLVMState &State) {
+                         const LLVMState &State, unsigned LoopRegister) {
   switch (Mode) {
   case Benchmark::Duplicate:
   case Benchmark::MiddleHalfDuplicate:
     return std::make_unique<DuplicateSnippetRepetitor>(State);
   case Benchmark::Loop:
   case Benchmark::MiddleHalfLoop:
-    return std::make_unique<LoopSnippetRepetitor>(State);
+    return std::make_unique<LoopSnippetRepetitor>(State, LoopRegister);
   case Benchmark::AggregateMin:
     break;
   }
diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
index 2b3c416c9029f7..c62e80f161f128 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
@@ -29,7 +29,8 @@ namespace exegesis {
 class SnippetRepetitor {
 public:
   static std::unique_ptr<const SnippetRepetitor>
-  Create(Benchmark::RepetitionModeE Mode, const LLVMState &State);
+  Create(Benchmark::RepetitionModeE Mode, const LLVMState &State,
+         unsigned LoopRegister);
 
   virtual ~SnippetRepetitor();
 
diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
index 7bbd946b03331f..522c75d15703d5 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -202,12 +202,15 @@ class ExegesisTarget {
   }
 
   // Returns a counter usable as a loop counter.
-  virtual unsigned getLoopCounterRegister(const Triple &) const { return 0; }
+  virtual unsigned getDefaultLoopCounterRegister(const Triple &) const {
+    return 0;
+  }
 
   // Adds the code to decrement the loop counter and
   virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                            MachineBasicBlock &TargetMBB,
-                                           const MCInstrInfo &MII) const {
+                                           const MCInstrInfo &MII,
+                                           unsigned LoopRegister) const {
     llvm_unreachable("decrementLoopCounterAndBranch() requires "
                      "getLoopCounterRegister() > 0");
   }
diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 6fc951a6e35d6a..a41a995f5560af 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -720,7 +720,7 @@ class ExegesisX86Target : public ExegesisTarget {
 
   unsigned getScratchMemoryRegister(const Triple &TT) const override;
 
-  unsigned getLoopCounterRegister(const Triple &) const override;
+  unsigned getDefaultLoopCounterRegister(const Triple &) const override;
 
   unsigned getMaxMemoryAccessSize() const override { return 64; }
 
@@ -733,7 +733,8 @@ class ExegesisX86Target : public ExegesisTarget {
 
   void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                    MachineBasicBlock &TargetMBB,
-                                   const MCInstrInfo &MII) const override;
+                                   const MCInstrInfo &MII,
+                                   unsigned LoopRegister) const override;
 
   std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                                const APInt &Value) const override;
@@ -852,7 +853,7 @@ const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
 // We're using one of R8-R15 because these registers are never hardcoded in
 // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less
 // conflicts.
-constexpr const unsigned kLoopCounterReg = X86::R8;
+constexpr const unsigned kDefaultLoopCounterReg = X86::R8;
 
 } // namespace
 
@@ -870,11 +871,12 @@ unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
   return TT.isOSWindows() ? X86::RCX : X86::RDI;
 }
 
-unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
+unsigned
+ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
   if (!TT.isArch64Bit()) {
     return 0;
   }
-  return kLoopCounterReg;
+  return kDefaultLoopCounterReg;
 }
 
 Error ExegesisX86Target::randomizeTargetMCOperand(
@@ -912,10 +914,10 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
 
 void ExegesisX86Target::decrementLoopCounterAndJump(
     MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
-    const MCInstrInfo &MII) const {
+    const MCInstrInfo &MII, unsigned LoopRegister) const {
   BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
-      .addDef(kLoopCounterReg)
-      .addUse(kLoopCounterReg)
+      .addDef(LoopRegister)
+      .addUse(LoopRegister)
       .addImm(-1);
   BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
       .addMBB(&TargetMBB)
diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index 782d44422791ca..1ae2565e894c69 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -497,22 +497,42 @@ void benchmarkMain() {
   }
 
   const auto Opcodes = getOpcodesOrDie(State);
+  std::vector<BenchmarkCode> Configurations;
+
+  unsigned LoopRegister =
+      State.getExegesisTarget().getDefaultLoopCounterRegister(
+          State.getTargetMachine().getTargetTriple());
+
+  if (Opcodes.empty()) {
+    Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
+    for (const auto &Configuration : Configurations) {
+      if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
+          (Configuration.Key.MemoryMappings.size() != 0 ||
+           Configuration.Key.MemoryValues.size() != 0 ||
+           Configuration.Key.SnippetAddress != 0))
+        ExitWithError("Memory and snippet address annotations are only "
+                      "supported in subprocess "
+                      "execution mode");
+    }
+    LoopRegister = Configurations[0].Key.LoopRegister;
+  }
 
   SmallVector<std::unique_ptr<const SnippetRepetitor>, 2> Repetitors;
   if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin)
-    Repetitors.emplace_back(SnippetRepetitor::Create(RepetitionMode, State));
+    Repetitors.emplace_back(
+        SnippetRepetitor::Create(RepetitionMode, State, LoopRegister));
   else {
     for (Benchmark::RepetitionModeE RepMode :
          {Benchmark::RepetitionModeE::Duplicate,
           Benchmark::RepetitionModeE::Loop})
-      Repetitors.emplace_back(SnippetRepetitor::Create(RepMode, State));
+      Repetitors.emplace_back(
+          SnippetRepetitor::Create(RepMode, State, LoopRegister));
   }
 
   BitVector AllReservedRegs;
   for (const std::unique_ptr<const SnippetRepetitor> &Repetitor : Repetitors)
     AllReservedRegs |= Repetitor->getReservedRegs();
 
-  std::vector<BenchmarkCode> Configurations;
   if (!Opcodes.empty()) {
     for (const unsigned Opcode : Opcodes) {
       // Ignore instructions without a sched class if
@@ -534,17 +554,6 @@ void benchmarkMain() {
       std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
                 std::back_inserter(Configurations));
     }
-  } else {
-    Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
-    for (const auto &Configuration : Configurations) {
-      if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
-          (Configuration.Key.MemoryMappings.size() != 0 ||
-           Configuration.Key.MemoryValues.size() != 0 ||
-           Configuration.Key.SnippetAddress != 0))
-        ExitWithError("Memory and snippet address annotations are only "
-                      "supported in subprocess "
-                      "execution mode");
-    }
   }
 
   if (MinInstructions == 0) {
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index ec9dc0a2a814d4..6318578b110048 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -254,6 +254,21 @@ parseSetSectionFlagValue(StringRef FlagValue) {
   return SFU;
 }
 
+static Expected<uint8_t> parseVisibilityType(StringRef VisType) {
+  const uint8_t Invalid = 0xff;
+  uint8_t type = StringSwitch<uint8_t>(VisType)
+                     .Case("default", ELF::STV_DEFAULT)
+                     .Case("hidden", ELF::STV_HIDDEN)
+                     .Case("internal", ELF::STV_INTERNAL)
+                     .Case("protected", ELF::STV_PROTECTED)
+                     .Default(Invalid);
+  if (type == Invalid)
+    return createStringError(errc::invalid_argument,
+                             "'%s' is not a valid symbol visibility",
+                             VisType.str().c_str());
+  return type;
+}
+
 namespace {
 struct TargetInfo {
   FileFormat Format;
@@ -969,6 +984,33 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
 
     Config.SymbolsToAdd.push_back(*SymInfo);
   }
+  for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbol_visibility)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      return createStringError(errc::invalid_argument,
+                               "bad format for --set-symbol-visibility");
+    auto [Sym, Visibility] = StringRef(Arg->getValue()).split('=');
+    Expected<uint8_t> Type = parseVisibilityType(Visibility);
+    if (!Type)
+      return Type.takeError();
+    ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+    if (Error E = ELFConfig.SymbolsToSetVisibility.back().first.addMatcher(
+            NameOrPattern::create(Sym, SymbolMatchStyle, ErrorCallback)))
+      return std::move(E);
+  }
+  for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbols_visibility)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      return createStringError(errc::invalid_argument,
+                               "bad format for --set-symbols-visibility");
+    auto [File, Visibility] = StringRef(Arg->getValue()).split('=');
+    Expected<uint8_t> Type = parseVisibilityType(Visibility);
+    if (!Type)
+      return Type.takeError();
+    ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+    if (Error E =
+            addSymbolsFromFile(ELFConfig.SymbolsToSetVisibility.back().first,
+                               DC.Alloc, File, SymbolMatchStyle, ErrorCallback))
+      return std::move(E);
+  }
 
   ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links);
 
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index 86774c889ab860..3c0e5cd475a36b 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -88,6 +88,17 @@ defm set_section_type
          "Set the type of section <section> to the integer <type>">,
       MetaVarName<"section=type">;
 
+defm set_symbol_visibility
+    : Eq<"set-symbol-visibility",
+         "Change the visibility of a symbol to the specified value">,
+      MetaVarName<"symbol=visibility">;
+defm set_symbols_visibility
+    : Eq<"set-symbols-visibility",
+         "Read a list of symbols from <filename> and change their "
+         "visibility to the specified value. Visibility values: default, "
+         "internal, hidden, protected">,
+      MetaVarName<"filename=visibility">;
+
 def S : Flag<["-"], "S">,
         Alias<strip_all>,
         HelpText<"Alias for --strip-all">;
diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
index b58d7cd952aff0..69d9b9a2f784f7 100644
--- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
+++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp
@@ -71,10 +71,10 @@ dumpDXContainer(MemoryBufferRef Source) {
       break;
     }
     case dxbc::PartType::SFI0: {
-      std::optional<uint64_t> Flags = Container.getShaderFlags();
+      std::optional<uint64_t> Flags = Container.getShaderFeatureFlags();
       // Omit the flags in the YAML if they are missing or zero.
       if (Flags && *Flags > 0)
-        NewPart.Flags = DXContainerYAML::ShaderFlags(*Flags);
+        NewPart.Flags = DXContainerYAML::ShaderFeatureFlags(*Flags);
       break;
     }
     case dxbc::PartType::HASH: {
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 7426884217a08e..1f2b8c1754f6ef 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -110,12 +110,12 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObject) {
   SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc);
   SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
                                 PtrInfo.getWithOffset(Offset));
-  std::optional<int64_t> NumBytes = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes = cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias);
+      Store.getNode(), LocationSize::precise(NumBytes), Store.getNode(),
+      LocationSize::precise(NumBytes), *DAG, IsAlias);
 
   EXPECT_TRUE(IsValid);
   EXPECT_TRUE(IsAlias);
@@ -134,14 +134,10 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObjectUnknownSize) {
   SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
                                 PtrInfo.getWithOffset(Offset));
 
-  // Maybe unlikely that BaseIndexOffset::computeAliasing is used with the
-  // optional NumBytes being unset like in this test, but it would be confusing
-  // if that function determined IsAlias=false here.
-  std::optional<int64_t> NumBytes;
-
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias);
+      Store.getNode(), LocationSize::beforeOrAfterPointer(), Store.getNode(),
+      LocationSize::beforeOrAfterPointer(), *DAG, IsAlias);
 
   EXPECT_FALSE(IsValid);
 }
@@ -165,14 +161,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, noAliasingFrameObjects) {
                                  PtrInfo.getWithOffset(Offset0));
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1,
                                  PtrInfo.getWithOffset(Offset1));
-  std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
-  std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+  TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+      Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+      LocationSize::precise(NumBytes1), *DAG, IsAlias);
 
   EXPECT_TRUE(IsValid);
   EXPECT_FALSE(IsAlias);
@@ -195,14 +190,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, unknownSizeFrameObjects) {
       DAG->getStore(DAG->getEntryNode(), Loc, Value, FIPtr, PtrInfo);
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1,
                                  MachinePointerInfo(PtrInfo.getAddrSpace()));
-  std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
-  std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+  TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+      Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+      LocationSize::precise(NumBytes1), *DAG, IsAlias);
 
   EXPECT_FALSE(IsValid);
 }
@@ -220,20 +214,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithFrameObject) {
   SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc);
   SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index,
                                 PtrInfo.getWithOffset(Offset));
-  std::optional<int64_t> NumBytes = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes = cast<StoreSDNode>(Store)->getMemoryVT().getStoreSize();
   EVT GTy = DAG->getTargetLoweringInfo().getValueType(DAG->getDataLayout(),
                                                       G->getType());
   SDValue GValue = DAG->getConstant(0, Loc, GTy);
   SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy);
   SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr,
                                  MachinePointerInfo(G, 0));
-  std::optional<int64_t> GNumBytes = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize());
+  TypeSize GNumBytes = cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store.getNode(), NumBytes, GStore.getNode(), GNumBytes, *DAG, IsAlias);
+      Store.getNode(), LocationSize::precise(NumBytes), GStore.getNode(),
+      LocationSize::precise(GNumBytes), *DAG, IsAlias);
 
   EXPECT_TRUE(IsValid);
   EXPECT_FALSE(IsAlias);
@@ -248,8 +241,7 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) {
   SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy);
   SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr,
                                  MachinePointerInfo(G, 0));
-  std::optional<int64_t> GNumBytes = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize());
+  TypeSize GNumBytes = cast<StoreSDNode>(GStore)->getMemoryVT().getStoreSize();
 
   SDValue AliasedGValue = DAG->getConstant(1, Loc, GTy);
   SDValue AliasedGAddr = DAG->getGlobalAddress(AliasedG, Loc, GTy);
@@ -258,9 +250,9 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) {
                     MachinePointerInfo(AliasedG, 0));
 
   bool IsAlias;
-  bool IsValid = BaseIndexOffset::computeAliasing(GStore.getNode(), GNumBytes,
-                                                  AliasedGStore.getNode(),
-                                                  GNumBytes, *DAG, IsAlias);
+  bool IsValid = BaseIndexOffset::computeAliasing(
+      GStore.getNode(), LocationSize::precise(GNumBytes),
+      AliasedGStore.getNode(), LocationSize::precise(GNumBytes), *DAG, IsAlias);
 
   // With some deeper analysis we could detect if G and AliasedG is aliasing or
   // not. But computeAliasing is currently defensive and assumes that a
@@ -290,19 +282,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsWithinDiff) {
                                  PtrInfo.getWithOffset(Offset0));
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
                                  PtrInfo.getWithOffset(Offset1));
-  std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
-  std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+  TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+      Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+      LocationSize::precise(NumBytes1), *DAG, IsAlias);
   EXPECT_TRUE(IsValid);
   EXPECT_FALSE(IsAlias);
 
   IsValid = BaseIndexOffset::computeAliasing(
-      Store1.getNode(), NumBytes1, Store0.getNode(), NumBytes0, *DAG, IsAlias);
+      Store1.getNode(), LocationSize::precise(NumBytes1), Store0.getNode(),
+      LocationSize::precise(NumBytes0), *DAG, IsAlias);
   EXPECT_TRUE(IsValid);
   EXPECT_FALSE(IsAlias);
 }
@@ -331,14 +323,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsOutOfDiff) {
                                  PtrInfo.getWithOffset(Offset0));
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
                                  PtrInfo.getWithOffset(Offset1));
-  std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
-  std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+  TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+      Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+      LocationSize::precise(NumBytes1), *DAG, IsAlias);
   EXPECT_TRUE(IsValid);
   EXPECT_TRUE(IsAlias);
 }
@@ -365,14 +356,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, twoFixedStackObjects) {
                                  PtrInfo0.getWithOffset(Offset0));
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1,
                                  PtrInfo1.getWithOffset(Offset0));
-  std::optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
-  std::optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
-      cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize());
+  TypeSize NumBytes0 = cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize();
+  TypeSize NumBytes1 = cast<StoreSDNode>(Store1)->getMemoryVT().getStoreSize();
 
   bool IsAlias;
   bool IsValid = BaseIndexOffset::computeAliasing(
-      Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias);
+      Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(),
+      LocationSize::precise(NumBytes1), *DAG, IsAlias);
   EXPECT_TRUE(IsValid);
   EXPECT_FALSE(IsAlias);
 }
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index d923b25fda9f96..fdbe8df783b117 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -3327,8 +3327,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) {
     EXPECT_NE(IPBB->end(), IP.getPoint());
   };
 
-  Builder.restoreIP(OMPBuilder.createSingle(
-      Builder, BodyGenCB, FiniCB, /*IsNowait*/ false, /*DidIt*/ nullptr));
+  Builder.restoreIP(
+      OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ false));
   Value *EntryBBTI = EntryBB->getTerminator();
   EXPECT_NE(EntryBBTI, nullptr);
   EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
@@ -3417,8 +3417,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) {
     EXPECT_NE(IPBB->end(), IP.getPoint());
   };
 
-  Builder.restoreIP(OMPBuilder.createSingle(
-      Builder, BodyGenCB, FiniCB, /*IsNowait*/ true, /*DidIt*/ nullptr));
+  Builder.restoreIP(
+      OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ true));
   Value *EntryBBTI = EntryBB->getTerminator();
   EXPECT_NE(EntryBBTI, nullptr);
   EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
@@ -3464,6 +3464,151 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) {
   EXPECT_EQ(ExitBarrier, nullptr);
 }
 
+// Helper class to check each instruction of a BB.
+class BBInstIter {
+  BasicBlock *BB;
+  BasicBlock::iterator BBI;
+
+public:
+  BBInstIter(BasicBlock *BB) : BB(BB), BBI(BB->begin()) {}
+
+  bool hasNext() const { return BBI != BB->end(); }
+
+  template <typename InstTy> InstTy *next() {
+    if (!hasNext())
+      return nullptr;
+    Instruction *Cur = &*BBI++;
+    if (!isa<InstTy>(Cur))
+      return nullptr;
+    return cast<InstTy>(Cur);
+  }
+};
+
+TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> Builder(BB);
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+
+  AllocaInst *PrivAI = nullptr;
+
+  BasicBlock *EntryBB = nullptr;
+  BasicBlock *ThenBB = nullptr;
+
+  Value *CPVar = Builder.CreateAlloca(F->arg_begin()->getType());
+  Builder.CreateStore(F->arg_begin(), CPVar);
+
+  FunctionType *CopyFuncTy = FunctionType::get(
+      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, false);
+  Function *CopyFunc =
+      Function::Create(CopyFuncTy, Function::PrivateLinkage, "copy_var", *M);
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    if (AllocaIP.isSet())
+      Builder.restoreIP(AllocaIP);
+    else
+      Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt()));
+    PrivAI = Builder.CreateAlloca(F->arg_begin()->getType());
+    Builder.CreateStore(F->arg_begin(), PrivAI);
+
+    llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock();
+    llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint();
+    EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst);
+
+    Builder.restoreIP(CodeGenIP);
+
+    // collect some info for checks later
+    ThenBB = Builder.GetInsertBlock();
+    EntryBB = ThenBB->getUniquePredecessor();
+
+    // simple instructions for body
+    Value *PrivLoad =
+        Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use");
+    Builder.CreateICmpNE(F->arg_begin(), PrivLoad);
+  };
+
+  auto FiniCB = [&](InsertPointTy IP) {
+    BasicBlock *IPBB = IP.getBlock();
+    // IP must be before the unconditional branch to ExitBB
+    EXPECT_NE(IPBB->end(), IP.getPoint());
+  };
+
+  Builder.restoreIP(OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB,
+                                            /*IsNowait*/ false, {CPVar},
+                                            {CopyFunc}));
+  Value *EntryBBTI = EntryBB->getTerminator();
+  EXPECT_NE(EntryBBTI, nullptr);
+  EXPECT_TRUE(isa<BranchInst>(EntryBBTI));
+  BranchInst *EntryBr = cast<BranchInst>(EntryBB->getTerminator());
+  EXPECT_TRUE(EntryBr->isConditional());
+  EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB);
+  BasicBlock *ExitBB = ThenBB->getUniqueSuccessor();
+  EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB);
+
+  CmpInst *CondInst = cast<CmpInst>(EntryBr->getCondition());
+  EXPECT_TRUE(isa<CallInst>(CondInst->getOperand(0)));
+
+  CallInst *SingleEntryCI = cast<CallInst>(CondInst->getOperand(0));
+  EXPECT_EQ(SingleEntryCI->arg_size(), 2U);
+  EXPECT_EQ(SingleEntryCI->getCalledFunction()->getName(), "__kmpc_single");
+  EXPECT_TRUE(isa<GlobalVariable>(SingleEntryCI->getArgOperand(0)));
+
+  // check ThenBB
+  BBInstIter ThenBBI(ThenBB);
+  // load PrivAI
+  auto *PrivLI = ThenBBI.next<LoadInst>();
+  EXPECT_NE(PrivLI, nullptr);
+  EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI);
+  // icmp
+  EXPECT_TRUE(ThenBBI.next<ICmpInst>());
+  // store 1, DidIt
+  auto *DidItSI = ThenBBI.next<StoreInst>();
+  EXPECT_NE(DidItSI, nullptr);
+  EXPECT_EQ(DidItSI->getValueOperand(),
+            ConstantInt::get(Type::getInt32Ty(Ctx), 1));
+  Value *DidIt = DidItSI->getPointerOperand();
+  // call __kmpc_end_single
+  auto *SingleEndCI = ThenBBI.next<CallInst>();
+  EXPECT_NE(SingleEndCI, nullptr);
+  EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single");
+  EXPECT_EQ(SingleEndCI->arg_size(), 2U);
+  EXPECT_TRUE(isa<GlobalVariable>(SingleEndCI->getArgOperand(0)));
+  EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1));
+  // br ExitBB
+  auto *ExitBBBI = ThenBBI.next<BranchInst>();
+  EXPECT_NE(ExitBBBI, nullptr);
+  EXPECT_TRUE(ExitBBBI->isUnconditional());
+  EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB);
+  EXPECT_FALSE(ThenBBI.hasNext());
+
+  // check ExitBB
+  BBInstIter ExitBBI(ExitBB);
+  // call __kmpc_global_thread_num
+  auto *ThreadNumCI = ExitBBI.next<CallInst>();
+  EXPECT_NE(ThreadNumCI, nullptr);
+  EXPECT_EQ(ThreadNumCI->getCalledFunction()->getName(),
+            "__kmpc_global_thread_num");
+  // load DidIt
+  auto *DidItLI = ExitBBI.next<LoadInst>();
+  EXPECT_NE(DidItLI, nullptr);
+  EXPECT_EQ(DidItLI->getPointerOperand(), DidIt);
+  // call __kmpc_copyprivate
+  auto *CopyPrivateCI = ExitBBI.next<CallInst>();
+  EXPECT_NE(CopyPrivateCI, nullptr);
+  EXPECT_EQ(CopyPrivateCI->arg_size(), 6U);
+  EXPECT_TRUE(isa<AllocaInst>(CopyPrivateCI->getArgOperand(3)));
+  EXPECT_EQ(CopyPrivateCI->getArgOperand(3), CPVar);
+  EXPECT_TRUE(isa<Function>(CopyPrivateCI->getArgOperand(4)));
+  EXPECT_EQ(CopyPrivateCI->getArgOperand(4), CopyFunc);
+  EXPECT_TRUE(isa<LoadInst>(CopyPrivateCI->getArgOperand(5)));
+  DidItLI = cast<LoadInst>(CopyPrivateCI->getArgOperand(5));
+  EXPECT_EQ(DidItLI->getOperand(0), DidIt);
+  EXPECT_FALSE(ExitBBI.hasNext());
+}
+
 TEST_F(OpenMPIRBuilderTest, OMPAtomicReadFlt) {
   OpenMPIRBuilder OMPBuilder(*M);
   OMPBuilder.initialize();
diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp
index 34fe376989e7ba..b02b7e60095359 100644
--- a/llvm/unittests/Support/TypeSizeTest.cpp
+++ b/llvm/unittests/Support/TypeSizeTest.cpp
@@ -81,7 +81,6 @@ static_assert(INT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
 static_assert(UINT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
 static_assert(alignTo(TypeSize::getFixed(7), 8) == TypeSize::getFixed(8));
 
-static_assert(TypeSize() == TypeSize::getFixed(0));
 static_assert(TypeSize::getZero() == TypeSize::getFixed(0));
 static_assert(TypeSize::getZero() != TypeSize::getScalable(0));
 static_assert(TypeSize::getFixed(0) != TypeSize::getScalable(0));
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
index aeb25bf012d034..3a76054ca4f36d 100644
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -1126,7 +1126,9 @@ TEST(MachineInstr, HasSideEffects) {
       VLDR_VPR_post,
       VLDR_VPR_pre,
       VLLDM,
+      VLLDM_T2,
       VLSTM,
+      VLSTM_T2,
       VMRS,
       VMRS_FPCXTNS,
       VMRS_FPCXTS,
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
index 505a030675f64c..f1fa891171177c 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
@@ -219,6 +219,25 @@ TEST_F(X86SnippetFileTest, SnippetAddress) {
   EXPECT_EQ(Snippet.Key.SnippetAddress, 0x10000);
 }
 
+TEST_F(X86SnippetFileTest, LoopRegister) {
+  auto Snippets = TestCommon(R"(
+    # LLVM-EXEGESIS-LOOP-REGISTER R11
+  )");
+  ASSERT_TRUE(static_cast<bool>(Snippets));
+  EXPECT_THAT(*Snippets, SizeIs(1));
+  const auto &Snippet = (*Snippets)[0];
+  EXPECT_EQ(Snippet.Key.LoopRegister, X86::R11);
+}
+
+TEST_F(X86SnippetFileTest, LoopRegisterInvalidRegister) {
+  auto Error = TestCommon(R"(
+    # LLVM-EXEGESIS-LOOP-REGISTER INVALID
+  )")
+                   .takeError();
+  EXPECT_TRUE(static_cast<bool>(Error));
+  consumeError(std::move(Error));
+}
+
 } // namespace
 } // namespace exegesis
 } // namespace llvm
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
index 25e8836087c15d..b55ca5057ae01c 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
@@ -40,7 +40,10 @@ class X86SnippetRepetitorTest : public X86TestBase {
 
   void TestCommon(Benchmark::RepetitionModeE RepetitionMode,
                   unsigned SnippetInstructions = 1) {
-    const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
+    const auto Repetitor = SnippetRepetitor::Create(
+        RepetitionMode, State,
+        State.getExegesisTarget().getDefaultLoopCounterRegister(
+            State.getTargetMachine().getTargetTriple()));
     const std::vector<MCInst> Instructions(SnippetInstructions,
                                            MCInstBuilder(X86::NOOP));
     FunctionFiller Sink(*MF, {X86::EAX});
@@ -98,11 +101,12 @@ TEST_F(X86SnippetRepetitorTest, Loop) {
                           HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                           HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
                           HasOpcode(X86::JCC_1)));
-  EXPECT_THAT(LoopBlock.liveins(),
-              UnorderedElementsAre(
-                  LiveReg(X86::EAX),
-                  LiveReg(State.getExegesisTarget().getLoopCounterRegister(
-                      State.getTargetMachine().getTargetTriple()))));
+  EXPECT_THAT(
+      LoopBlock.liveins(),
+      UnorderedElementsAre(
+          LiveReg(X86::EAX),
+          LiveReg(State.getExegesisTarget().getDefaultLoopCounterRegister(
+              State.getTargetMachine().getTargetTriple()))));
   EXPECT_THAT(MF->getBlockNumbered(2)->instrs(),
               ElementsAre(HasOpcode(X86::RET64)));
 }
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp
index d80761d5fe35d2..1e80eb6b1ad50e 100644
--- a/llvm/utils/TableGen/CodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/CodeEmitterGen.cpp
@@ -365,8 +365,8 @@ void CodeEmitterGen::emitInstructionBaseValues(
   if (HwMode == -1)
     o << "  static const uint64_t InstBits[] = {\n";
   else
-    o << "  static const uint64_t InstBits_" << HWM.getMode(HwMode).Name
-      << "[] = {\n";
+    o << "  static const uint64_t InstBits_"
+      << HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n";
 
   for (const CodeGenInstruction *CGI : NumberedInstructions) {
     Record *R = CGI->TheDef;
@@ -495,8 +495,8 @@ void CodeEmitterGen::run(raw_ostream &o) {
       o << "  switch (HwMode) {\n";
       o << "  default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n";
       for (unsigned I : HwModes) {
-        o << "  case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name
-          << "; break;\n";
+        o << "  case " << I << ": InstBits = InstBits_"
+          << HWM.getModeName(I, /*IncludeDefault=*/true) << "; break;\n";
       }
       o << "  };\n";
     }
diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h
index 56639f741ede11..23723b7bd4af55 100644
--- a/llvm/utils/TableGen/CodeGenHwModes.h
+++ b/llvm/utils/TableGen/CodeGenHwModes.h
@@ -52,6 +52,11 @@ struct CodeGenHwModes {
     assert(Id != 0 && "Mode id of 0 is reserved for the default mode");
     return Modes[Id - 1];
   }
+  StringRef getModeName(unsigned Id, bool IncludeDefault = false) const {
+    if (IncludeDefault && Id == CodeGenHwModes::DefaultMode)
+      return DefaultModeName;
+    return getMode(Id).Name;
+  }
   const HwModeSelect &getHwModeSelect(Record *R) const;
   const std::map<Record *, HwModeSelect> &getHwModeSelects() const {
     return ModeSelects;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index 4ce5a73d775668..27ff84bce4058e 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -2461,8 +2461,9 @@ collectHwModesReferencedForEncodings(const CodeGenHwModes &HWM,
         BV.set(P.first);
     }
   }
-  transform(BV.set_bits(), std::back_inserter(Names),
-            [&HWM](const int &M) { return HWM.getMode(M).Name; });
+  transform(BV.set_bits(), std::back_inserter(Names), [&HWM](const int &M) {
+    return HWM.getModeName(M, /*IncludeDefault=*/true);
+  });
 }
 
 // Emits disassembler code for instruction decoding.
@@ -2503,8 +2504,9 @@ void DecoderEmitter::run(raw_ostream &o) {
       if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) {
         EncodingInfoByHwMode EBM(DI->getDef(), HWM);
         for (auto &KV : EBM)
-          NumberedEncodings.emplace_back(KV.second, NumberedInstruction,
-                                         HWM.getMode(KV.first).Name);
+          NumberedEncodings.emplace_back(
+              KV.second, NumberedInstruction,
+              HWM.getModeName(KV.first, /*IncludeDefault=*/true));
         continue;
       }
     }
diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index fbff113613d269..5e533bf23ec475 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -8,6 +8,8 @@ static_library("InstallAPI") {
   ]
   sources = [
     "FileList.cpp",
+    "Frontend.cpp",
     "HeaderFile.cpp",
+    "Visitor.cpp",
   ]
 }
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
index 949b3b2147405c..a8d6290f1b990d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn
@@ -61,6 +61,7 @@ static_library("LLVMWebAssemblyCodeGen") {
     "WebAssemblyOptimizeLiveIntervals.cpp",
     "WebAssemblyOptimizeReturned.cpp",
     "WebAssemblyPeephole.cpp",
+    "WebAssemblyRefTypeMem2Local.cpp",
     "WebAssemblyRegColoring.cpp",
     "WebAssemblyRegNumbering.cpp",
     "WebAssemblyRegStackify.cpp",
diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index 16c898bdeb6e00..070609c94a3b38 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -111,8 +111,6 @@ if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD)
 else()
   set(MLIR_ENABLE_CUDA_CONVERSIONS 0)
 endif()
-# TODO: we should use a config.h file like LLVM does
-add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_ENABLE_CUDA_CONVERSIONS})
 
 # Build the ROCm conversions and run according tests if the AMDGPU backend
 # is available.
diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md
index 85255fdc5e6439..8a3acc33600a41 100644
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@@ -50,6 +50,7 @@ An example of how the compilation workflow look is:
 ```
 mlir-opt example.mlir                   \
   --pass-pipeline="builtin.module(      \
+    gpu-kernel-outlining,               \ # Outline gpu.launch body to a kernel.
     nvvm-attach-target{chip=sm_90 O=3}, \ # Attach an NVVM target to a gpu.module op.
     gpu.module(convert-gpu-to-nvvm),    \ # Convert GPU to NVVM.
     gpu-to-llvm,                        \ # Convert GPU to LLVM.
diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md
index ff86bbfef7b0bf..c9d705f0506a3e 100644
--- a/mlir/docs/PassManagement.md
+++ b/mlir/docs/PassManagement.md
@@ -56,8 +56,7 @@ By default, an operation pass is `op-agnostic`, meaning that it operates on the
 operation type of the pass manager that it is added to. This means a pass may operate
 on many different types of operations. Agnostic passes should be written such that
 they do not make assumptions on the operation they run on. Examples of this type of pass are
-[Canonicalization](Pass.md/-canonicalize-canonicalize-operations)
-[Common Sub-Expression Elimination](Passes.md/#-cse-eliminate-common-sub-expressions).
+[Canonicalization](Passes.md/#-canonicalize) and [Common Sub-Expression Elimination](Passes.md/#-cse).
 
 To create an agnostic operation pass, a derived class must adhere to the following:
 
diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md
index 011cd14175634b..0ba76199874cc3 100644
--- a/mlir/docs/PatternRewriter.md
+++ b/mlir/docs/PatternRewriter.md
@@ -366,7 +366,7 @@ Note: This driver listens for IR changes via the callbacks provided by
 rewriter and do not bypass the rewriter API by modifying ops directly.
 
 Note: This driver is the one used by the [canonicalization](Canonicalization.md)
-[pass](Passes.md/#-canonicalize-canonicalize-operations) in MLIR.
+[pass](Passes.md/#-canonicalize) in MLIR.
 
 ### Debugging
 
diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt
index d256bf1a5cbb13..2a1cac34d8c290 100644
--- a/mlir/examples/CMakeLists.txt
+++ b/mlir/examples/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(toy)
 add_subdirectory(transform)
+add_subdirectory(transform-opt)
 add_subdirectory(minimal-opt)
diff --git a/mlir/examples/transform-opt/CMakeLists.txt b/mlir/examples/transform-opt/CMakeLists.txt
new file mode 100644
index 00000000000000..8e23555d0b5d73
--- /dev/null
+++ b/mlir/examples/transform-opt/CMakeLists.txt
@@ -0,0 +1,26 @@
+get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
+get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
+get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
+
+set(LIBS
+  MLIRAnalysis
+  MLIRIR
+  MLIRParser
+  MLIRSupport
+  MLIRTransformDialect
+  MLIRTransformDialectTransforms
+  MLIRTransforms
+  ${dialect_libs}
+  ${conversion_libs}
+  ${extension_libs}
+)
+
+add_mlir_tool(mlir-transform-opt
+  mlir-transform-opt.cpp
+
+  DEPENDS
+  ${LIBS}
+)
+target_link_libraries(mlir-transform-opt PRIVATE ${LIBS})
+llvm_update_compile_flags(mlir-transform-opt)
+mlir_check_all_link_libraries(mlir-transform-opt)
diff --git a/mlir/examples/transform-opt/README.md b/mlir/examples/transform-opt/README.md
new file mode 100644
index 00000000000000..e9c8cc0173c7b2
--- /dev/null
+++ b/mlir/examples/transform-opt/README.md
@@ -0,0 +1,40 @@
+# Standalone Transform Dialect Interpreter
+
+This is an example of using the Transform dialect interpreter functionality standalone, that is, outside of the regular pass pipeline. The example is a
+binary capable of processing MLIR source files similar to `mlir-opt` and other
+optimizer drivers, with the entire transformation process driven by a Transform
+dialect script. This script can be embedded into the source file or provided in
+a separate MLIR source file.
+
+Either the input module or the transform module must contain a top-level symbol
+named `__transform_main`, which is used as the entry point to the transformation
+script.
+
+```sh
+mlir-transform-opt payload_with_embedded_transform.mlir
+mlir-transform-opt payload.mlir -transform=transform.mlir
+```
+
+The name of the entry point can be overridden using command-line options.
+
+```sh
+mlir-transform-opt payload-mlir -transform-entry-point=another_entry_point
+```
+
+Transform scripts can reference symbols defined in other source files, called
+libraries, which can be supplied to the binary through command-line options.
+Libraries will be embedded into the main transformation module by the tool and
+the interpreter will process everything as a single module. A debug option is
+available to see the contents of the transform module before it goes into the interpreter.
+
+```sh
+mlir-transform-opt payload.mlir -transform=transform.mlir \
+  -transform-library=external_definitions_1.mlir \
+  -transform-library=external_definitions_2.mlir \
+  -dump-library-module
+```
+
+Check out the [Transform dialect
+tutorial](https://mlir.llvm.org/docs/Tutorials/transform/) as well as
+[documentation](https://mlir.llvm.org/docs/Dialects/Transform/) to learn more
+about the dialect. 
diff --git a/mlir/examples/transform-opt/mlir-transform-opt.cpp b/mlir/examples/transform-opt/mlir-transform-opt.cpp
new file mode 100644
index 00000000000000..41a17f18726b1a
--- /dev/null
+++ b/mlir/examples/transform-opt/mlir-transform-opt.cpp
@@ -0,0 +1,389 @@
+//===- mlir-transform-opt.cpp -----------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/Utils.h"
+#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
+#include "mlir/IR/AsmState.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/DialectRegistry.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/InitAllDialects.h"
+#include "mlir/InitAllExtensions.h"
+#include "mlir/InitAllPasses.h"
+#include "mlir/Parser/Parser.h"
+#include "mlir/Support/FileUtilities.h"
+#include "mlir/Tools/mlir-opt/MlirOptMain.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include <cstdlib>
+
+namespace {
+
+using namespace llvm;
+
+/// Structure containing command line options for the tool, these will get
+/// initialized when an instance is created.
+struct MlirTransformOptCLOptions {
+  cl::opt<bool> allowUnregisteredDialects{
+      "allow-unregistered-dialect",
+      cl::desc("Allow operations coming from an unregistered dialect"),
+      cl::init(false)};
+
+  cl::opt<bool> verifyDiagnostics{
+      "verify-diagnostics",
+      cl::desc("Check that emitted diagnostics match expected-* lines "
+               "on the corresponding line"),
+      cl::init(false)};
+
+  cl::opt<std::string> payloadFilename{cl::Positional, cl::desc("<input file>"),
+                                       cl::init("-")};
+
+  cl::opt<std::string> outputFilename{"o", cl::desc("Output filename"),
+                                      cl::value_desc("filename"),
+                                      cl::init("-")};
+
+  cl::opt<std::string> transformMainFilename{
+      "transform",
+      cl::desc("File containing entry point of the transform script, if "
+               "different from the input file"),
+      cl::value_desc("filename"), cl::init("")};
+
+  cl::list<std::string> transformLibraryFilenames{
+      "transform-library", cl::desc("File(s) containing definitions of "
+                                    "additional transform script symbols")};
+
+  cl::opt<std::string> transformEntryPoint{
+      "transform-entry-point",
+      cl::desc("Name of the entry point transform symbol"),
+      cl::init(mlir::transform::TransformDialect::kTransformEntryPointSymbolName
+                   .str())};
+
+  cl::opt<bool> disableExpensiveChecks{
+      "disable-expensive-checks",
+      cl::desc("Disables potentially expensive checks in the transform "
+               "interpreter, providing more speed at the expense of "
+               "potential memory problems and silent corruptions"),
+      cl::init(false)};
+
+  cl::opt<bool> dumpLibraryModule{
+      "dump-library-module",
+      cl::desc("Prints the combined library module before the output"),
+      cl::init(false)};
+};
+} // namespace
+
+/// "Managed" static instance of the command-line options structure. This makes
+/// them locally-scoped and explicitly initialized/deinitialized. While this is
+/// not strictly necessary in the tool source file that is not being used as a
+/// library (where the options would pollute the global list of options), it is
+/// good practice to follow this.
+static llvm::ManagedStatic<MlirTransformOptCLOptions> clOptions;
+
+/// Explicitly registers command-line options.
+static void registerCLOptions() { *clOptions; }
+
+namespace {
+/// A wrapper class for source managers diagnostic. This provides both unique
+/// ownership and virtual function-like overload for a pair of
+/// inheritance-related classes that do not use virtual functions.
+class DiagnosticHandlerWrapper {
+public:
+  /// Kind of the diagnostic handler to use.
+  enum class Kind { EmitDiagnostics, VerifyDiagnostics };
+
+  /// Constructs the diagnostic handler of the specified kind of the given
+  /// source manager and context.
+  DiagnosticHandlerWrapper(Kind kind, llvm::SourceMgr &mgr,
+                           mlir::MLIRContext *context) {
+    if (kind == Kind::EmitDiagnostics)
+      handler = new mlir::SourceMgrDiagnosticHandler(mgr, context);
+    else
+      handler = new mlir::SourceMgrDiagnosticVerifierHandler(mgr, context);
+  }
+
+  /// This object is non-copyable but movable.
+  DiagnosticHandlerWrapper(const DiagnosticHandlerWrapper &) = delete;
+  DiagnosticHandlerWrapper(DiagnosticHandlerWrapper &&other) = default;
+  DiagnosticHandlerWrapper &
+  operator=(const DiagnosticHandlerWrapper &) = delete;
+  DiagnosticHandlerWrapper &operator=(DiagnosticHandlerWrapper &&) = default;
+
+  /// Verifies the captured "expected-*" diagnostics if required.
+  mlir::LogicalResult verify() const {
+    if (auto *ptr =
+            handler.dyn_cast<mlir::SourceMgrDiagnosticVerifierHandler *>()) {
+      return ptr->verify();
+    }
+    return mlir::success();
+  }
+
+  /// Destructs the object of the same type as allocated.
+  ~DiagnosticHandlerWrapper() {
+    if (auto *ptr = handler.dyn_cast<mlir::SourceMgrDiagnosticHandler *>()) {
+      delete ptr;
+    } else {
+      delete handler.get<mlir::SourceMgrDiagnosticVerifierHandler *>();
+    }
+  }
+
+private:
+  /// Internal storage is a type-safe union.
+  llvm::PointerUnion<mlir::SourceMgrDiagnosticHandler *,
+                     mlir::SourceMgrDiagnosticVerifierHandler *>
+      handler;
+};
+
+/// MLIR has deeply rooted expectations that the LLVM source manager contains
+/// exactly one buffer, until at least the lexer level. This class wraps
+/// multiple LLVM source managers each managing a buffer to match MLIR's
+/// expectations while still providing a centralized handling mechanism.
+class TransformSourceMgr {
+public:
+  /// Constructs the source manager indicating whether diagnostic messages will
+  /// be verified later on.
+  explicit TransformSourceMgr(bool verifyDiagnostics)
+      : verifyDiagnostics(verifyDiagnostics) {}
+
+  /// Deconstructs the source manager. Note that `checkResults` must have been
+  /// called on this instance before deconstructing it.
+  ~TransformSourceMgr() {
+    assert(resultChecked && "must check the result of diagnostic handlers by "
+                            "running TransformSourceMgr::checkResult");
+  }
+
+  /// Parses the given buffer and creates the top-level operation of the kind
+  /// specified as template argument in the given context. Additional parsing
+  /// options may be provided.
+  template <typename OpTy = mlir::Operation *>
+  mlir::OwningOpRef<OpTy> parseBuffer(std::unique_ptr<MemoryBuffer> buffer,
+                                      mlir::MLIRContext &context,
+                                      const mlir::ParserConfig &config) {
+    // Create a single-buffer LLVM source manager. Note that `unique_ptr` allows
+    // the code below to capture a reference to the source manager in such a way
+    // that it is not invalidated when the vector contents is eventually
+    // reallocated.
+    llvm::SourceMgr &mgr =
+        *sourceMgrs.emplace_back(std::make_unique<llvm::SourceMgr>());
+    mgr.AddNewSourceBuffer(std::move(buffer), llvm::SMLoc());
+
+    // Choose the type of diagnostic handler depending on whether diagnostic
+    // verification needs to happen and store it.
+    if (verifyDiagnostics) {
+      diagHandlers.emplace_back(
+          DiagnosticHandlerWrapper::Kind::VerifyDiagnostics, mgr, &context);
+    } else {
+      diagHandlers.emplace_back(DiagnosticHandlerWrapper::Kind::EmitDiagnostics,
+                                mgr, &context);
+    }
+
+    // Defer to MLIR's parser.
+    return mlir::parseSourceFile<OpTy>(mgr, config);
+  }
+
+  /// If diagnostic message verification has been requested upon construction of
+  /// this source manager, performs the verification, reports errors and returns
+  /// the result of the verification. Otherwise passes through the given value.
+  mlir::LogicalResult checkResult(mlir::LogicalResult result) {
+    resultChecked = true;
+    if (!verifyDiagnostics)
+      return result;
+
+    return mlir::failure(llvm::any_of(diagHandlers, [](const auto &handler) {
+      return mlir::failed(handler.verify());
+    }));
+  }
+
+private:
+  /// Indicates whether diagnostic message verification is requested.
+  const bool verifyDiagnostics;
+
+  /// Indicates that diagnostic message verification has taken place, and the
+  /// deconstruction is therefore safe.
+  bool resultChecked = false;
+
+  /// Storage for per-buffer source managers and diagnostic handlers. These are
+  /// wrapped into unique pointers in order to make it safe to capture
+  /// references to these objects: if the vector is reallocated, the unique
+  /// pointer objects are moved by the pointer addresses won't change. Also, for
+  /// handlers, this allows to store the pointer to the base class.
+  SmallVector<std::unique_ptr<llvm::SourceMgr>> sourceMgrs;
+  SmallVector<DiagnosticHandlerWrapper> diagHandlers;
+};
+} // namespace
+
+/// Trivial wrapper around `applyTransforms` that doesn't support extra mapping
+/// and doesn't enforce the entry point transform ops being top-level.
+static mlir::LogicalResult
+applyTransforms(mlir::Operation *payloadRoot,
+                mlir::transform::TransformOpInterface transformRoot,
+                const mlir::transform::TransformOptions &options) {
+  return applyTransforms(payloadRoot, transformRoot, {}, options,
+                         /*enforceToplevelTransformOp=*/false);
+}
+
+/// Applies transforms indicated in the transform dialect script to the input
+/// buffer. The transform script may be embedded in the input buffer or as a
+/// separate buffer. The transform script may have external symbols, the
+/// definitions of which must be provided in transform library buffers. If the
+/// application is successful, prints the transformed input buffer into the
+/// given output stream. Additional configuration options are derived from
+/// command-line options.
+static mlir::LogicalResult processPayloadBuffer(
+    raw_ostream &os, std::unique_ptr<MemoryBuffer> inputBuffer,
+    std::unique_ptr<llvm::MemoryBuffer> transformBuffer,
+    MutableArrayRef<std::unique_ptr<MemoryBuffer>> transformLibraries,
+    mlir::DialectRegistry &registry) {
+
+  // Initialize the MLIR context, and various configurations.
+  mlir::MLIRContext context(registry, mlir::MLIRContext::Threading::DISABLED);
+  context.allowUnregisteredDialects(clOptions->allowUnregisteredDialects);
+  mlir::ParserConfig config(&context);
+  TransformSourceMgr sourceMgr(
+      /*verifyDiagnostics=*/clOptions->verifyDiagnostics);
+
+  // Parse the input buffer that will be used as transform payload.
+  mlir::OwningOpRef<mlir::Operation *> payloadRoot =
+      sourceMgr.parseBuffer(std::move(inputBuffer), context, config);
+  if (!payloadRoot)
+    return sourceMgr.checkResult(mlir::failure());
+
+  // Identify the module containing the transform script entry point. This may
+  // be the same module as the input or a separate module. In the former case,
+  // make a copy of the module so it can be modified freely. Modification may
+  // happen in the script itself (at which point it could be rewriting itself
+  // during interpretation, leading to tricky memory errors) or by embedding
+  // library modules in the script.
+  mlir::OwningOpRef<mlir::ModuleOp> transformRoot;
+  if (transformBuffer) {
+    transformRoot = sourceMgr.parseBuffer<mlir::ModuleOp>(
+        std::move(transformBuffer), context, config);
+    if (!transformRoot)
+      return sourceMgr.checkResult(mlir::failure());
+  } else {
+    transformRoot = cast<mlir::ModuleOp>(payloadRoot->clone());
+  }
+
+  // Parse and merge the libraries into the main transform module.
+  for (auto &&transformLibrary : transformLibraries) {
+    mlir::OwningOpRef<mlir::ModuleOp> libraryModule =
+        sourceMgr.parseBuffer<mlir::ModuleOp>(std::move(transformLibrary),
+                                              context, config);
+
+    if (!libraryModule ||
+        mlir::failed(mlir::transform::detail::mergeSymbolsInto(
+            *transformRoot, std::move(libraryModule))))
+      return sourceMgr.checkResult(mlir::failure());
+  }
+
+  // If requested, dump the combined transform module.
+  if (clOptions->dumpLibraryModule)
+    transformRoot->dump();
+
+  // Find the entry point symbol. Even if it had originally been in the payload
+  // module, it was cloned into the transform module so only look there.
+  mlir::transform::TransformOpInterface entryPoint =
+      mlir::transform::detail::findTransformEntryPoint(
+          *transformRoot, mlir::ModuleOp(), clOptions->transformEntryPoint);
+  if (!entryPoint)
+    return sourceMgr.checkResult(mlir::failure());
+
+  // Apply the requested transformations.
+  mlir::transform::TransformOptions transformOptions;
+  transformOptions.enableExpensiveChecks(!clOptions->disableExpensiveChecks);
+  if (mlir::failed(applyTransforms(*payloadRoot, entryPoint, transformOptions)))
+    return sourceMgr.checkResult(mlir::failure());
+
+  // Print the transformed result and check the captured diagnostics if
+  // requested.
+  payloadRoot->print(os);
+  return sourceMgr.checkResult(mlir::success());
+}
+
+/// Tool entry point.
+static mlir::LogicalResult runMain(int argc, char **argv) {
+  // Register all upstream dialects and extensions. Specific uses are advised
+  // not to register all dialects indiscriminately but rather hand-pick what is
+  // necessary for their use case.
+  mlir::DialectRegistry registry;
+  mlir::registerAllDialects(registry);
+  mlir::registerAllExtensions(registry);
+  mlir::registerAllPasses();
+
+  // Explicitly register the transform dialect. This is not strictly necessary
+  // since it has been already registered as part of the upstream dialect list,
+  // but useful for example purposes for cases when dialects to register are
+  // hand-picked. The transform dialect must be registered.
+  registry.insert<mlir::transform::TransformDialect>();
+
+  // Register various command-line options. Note that the LLVM initializer
+  // object is a RAII that ensures correct deconstruction of command-line option
+  // objects inside ManagedStatic.
+  llvm::InitLLVM y(argc, argv);
+  mlir::registerAsmPrinterCLOptions();
+  mlir::registerMLIRContextCLOptions();
+  registerCLOptions();
+  llvm::cl::ParseCommandLineOptions(argc, argv,
+                                    "Minimal Transform dialect driver\n");
+
+  // Try opening the main input file.
+  std::string errorMessage;
+  std::unique_ptr<llvm::MemoryBuffer> payloadFile =
+      mlir::openInputFile(clOptions->payloadFilename, &errorMessage);
+  if (!payloadFile) {
+    llvm::errs() << errorMessage << "\n";
+    return mlir::failure();
+  }
+
+  // Try opening the output file.
+  std::unique_ptr<llvm::ToolOutputFile> outputFile =
+      mlir::openOutputFile(clOptions->outputFilename, &errorMessage);
+  if (!outputFile) {
+    llvm::errs() << errorMessage << "\n";
+    return mlir::failure();
+  }
+
+  // Try opening the main transform file if provided.
+  std::unique_ptr<llvm::MemoryBuffer> transformRootFile;
+  if (!clOptions->transformMainFilename.empty()) {
+    if (clOptions->transformMainFilename == clOptions->payloadFilename) {
+      llvm::errs() << "warning: " << clOptions->payloadFilename
+                   << " is provided as both payload and transform file\n";
+    } else {
+      transformRootFile =
+          mlir::openInputFile(clOptions->transformMainFilename, &errorMessage);
+      if (!transformRootFile) {
+        llvm::errs() << errorMessage << "\n";
+        return mlir::failure();
+      }
+    }
+  }
+
+  // Try opening transform library files if provided.
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> transformLibraries;
+  transformLibraries.reserve(clOptions->transformLibraryFilenames.size());
+  for (llvm::StringRef filename : clOptions->transformLibraryFilenames) {
+    transformLibraries.emplace_back(
+        mlir::openInputFile(filename, &errorMessage));
+    if (!transformLibraries.back()) {
+      llvm::errs() << errorMessage << "\n";
+      return mlir::failure();
+    }
+  }
+
+  return processPayloadBuffer(outputFile->os(), std::move(payloadFile),
+                              std::move(transformRootFile), transformLibraries,
+                              registry);
+}
+
+int main(int argc, char **argv) {
+  return mlir::asMainReturnCode(runMain(argc, argv));
+}
diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake
index e152a36c0ce0cf..4a7d75e2266823 100644
--- a/mlir/include/mlir/Config/mlir-config.h.cmake
+++ b/mlir/include/mlir/Config/mlir-config.h.cmake
@@ -29,4 +29,8 @@
 /* If set, enables PDL usage. */
 #cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH
 
+/* If set, enables CUDA-related features in CUDA-related transforms, pipelines,
+   and targets. */
+#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS
+
 #endif
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
index 23873d86b495c6..0eb67050608630 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
@@ -13,8 +13,8 @@ include "mlir/Dialect/Transform/IR/TransformAttrs.td"
 include "mlir/Dialect/Transform/IR/TransformDialect.td"
 include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
 include "mlir/Dialect/Transform/IR/TransformTypes.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
 
+include "mlir/Interfaces/SideEffectInterfaces.td"
 //===----------------------------------------------------------------------===//
 // ApplyOptimizeSharedMemoryReadsAndWritesOp
 //===----------------------------------------------------------------------===//
@@ -28,7 +28,9 @@ def ApplyOptimizeSharedMemoryReadsAndWritesOp :
     reads/writes with the goal of avoiding bank conflicts.
   }];
 
-  let arguments = (ins TransformHandleTypeInterface:$target);
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                    DefaultValuedOptionalAttr<I64Attr, "128">:$sharedMemoryLineSizeBytes,
+                    DefaultValuedOptionalAttr<I64Attr, "128">:$defaultVectorSizeBits);
   let results = (outs);
 
   let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index c8059e6d316e8a..67f951fd19d172 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -37,10 +37,17 @@ def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
     attempts to optimize reads/writes from a memref representing GPU shared
     memory in order to avoid bank conflicts.
   }];
-
   let dependentDialects = [
     "memref::MemRefDialect", "vector::VectorDialect"
   ];
+  let options = [
+    Option<"sharedMemoryLineSizeBytes", "shared-memory-line-size-bytes", "int64_t",
+           /*default=*/"128",
+           "Shared memory line size in bytes">,
+    Option<"defaultVectorSizeBits", "default-vector-size-bits", "int64_t",
+           /*default=*/"128",
+           "Default vector size in bits">,
+  ];
 }
 
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
index 79f9ab71a2b430..843cea2c503b9a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
@@ -45,11 +45,15 @@ namespace amdgpu {
 /// function that depends on the row Index. The permutation function is chosen
 /// to ensure that sequential distributed+vectorized reads/writes down a single
 /// dimension of the memref have minimal conflicts.
-LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
-                                                 Value memrefValue);
+LogicalResult
+optimizeSharedMemoryReadsAndWrites(Operation *parentOp, Value memrefValue,
+                                   int64_t sharedMemoryLineSizeBytes,
+                                   int64_t defaultVectorSizeBits);
 
 std::optional<LogicalResult>
-optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp);
+optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+                                     int64_t sharedMemoryLineSizeBytes,
+                                     int64_t defaultVectorSizeBits);
 
 } // namespace amdgpu
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 309573a562872f..53ed31877c6f24 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -2295,6 +2295,49 @@ def ConvertConv2DToImg2ColOp : Op<Transform_Dialect,
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// FlattenElementwiseLinalgOp
+//===----------------------------------------------------------------------===//
+
+def FlattenElementwiseLinalgOp : Op<Transform_Dialect,
+    "structured.flatten_elementwise",
+    [FunctionalStyleTransformOpTrait,
+     MemoryEffectsOpInterface,
+     TransformOpInterface,
+     TransformEachOpTrait,
+     ReportTrackingListenerFailuresOpTrait]> {
+  let description = [{
+    Flattens the iteration space and (applicable) operands of elementwise
+    linalg ops to a single dimension.
+
+    Returns one handle:
+    - Flattened linalg operation.
+
+    #### Return modes:
+
+    Returns a definite failure if target is not isolated from above.
+    Returns a silenceable failure if the pattern application failed.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target);
+  let results = (outs TransformHandleTypeInterface:$transformed);
+
+  let assemblyFormat =
+    "$target attr-dict `:` functional-type($target, results)";
+
+  let builders = [
+    OpBuilder<(ins "Value":$target)>
+  ];
+
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::linalg::LinalgOp target,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // Transpose Conv2D
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index a848d12fbbb50e..65cf19e7a4fcd6 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1074,6 +1074,11 @@ bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence);
 bool areDimSequencesPreserved(ArrayRef<AffineMap> maps,
                               ArrayRef<ReassociationIndices> dimSequences);
 
+struct CollapseResult {
+  SmallVector<Value> results;
+  LinalgOp collapsedOp;
+};
+
 /// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition
 /// to calling this method is that for each list in `foldedIterationDim`, the
 /// sequence of dimensions is contiguous in domains of all `indexing_maps` of
@@ -1081,9 +1086,8 @@ bool areDimSequencesPreserved(ArrayRef<AffineMap> maps,
 /// When valid, the method also collapses the operands of the op. Returns
 /// replacement values of the results of the original `linalgOp` by inserting
 /// reshapes to get back values of compatible types.
-template <typename LinalgType>
-FailureOr<SmallVector<Value>>
-collapseOpIterationDims(LinalgType op,
+FailureOr<CollapseResult>
+collapseOpIterationDims(LinalgOp op,
                         ArrayRef<ReassociationIndices> foldedIterationDims,
                         RewriterBase &rewriter);
 
diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 9e79b6aca1c9ba..5563cb907e9353 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -333,16 +333,28 @@ struct LevelType {
     return lvlBits & static_cast<uint64_t>(p);
   }
 
+  /// Check if the `LevelType` is considered to be sparse.
+  constexpr bool hasSparseSemantic() const {
+    return isa<LevelFormat::Compressed, LevelFormat::Singleton,
+               LevelFormat::LooseCompressed, LevelFormat::NOutOfM>();
+  }
+
+  /// Check if the `LevelType` is considered to be dense-like.
+  constexpr bool hasDenseSemantic() const {
+    return isa<LevelFormat::Dense, LevelFormat::Batch>();
+  }
+
   /// Check if the `LevelType` needs positions array.
   constexpr bool isWithPosLT() const {
-    return isa<LevelFormat::Compressed>() ||
-           isa<LevelFormat::LooseCompressed>();
+    assert(!isa<LevelFormat::Undef>());
+    return isa<LevelFormat::Compressed, LevelFormat::LooseCompressed>();
   }
 
   /// Check if the `LevelType` needs coordinates array.
   constexpr bool isWithCrdLT() const {
+    assert(!isa<LevelFormat::Undef>());
     // All sparse levels has coordinate array.
-    return !isa<LevelFormat::Dense, LevelFormat::Batch>();
+    return hasSparseSemantic();
   }
 
   std::string toMLIRString() const {
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 490ef3071af1b7..7f9820df984b29 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -509,8 +509,7 @@ class Merger {
   bool isSparseLvlWithNonTrivialIdxExp(TensorLoopId b) const {
     if (isLvlWithNonTrivialIdxExp(b)) {
       auto lt = getLoopDependentLevelType(b);
-      return isCompressedLT(lt) || isSingletonLT(lt) ||
-             isLooseCompressedLT(lt) || isNOutOfMLT(lt);
+      return lt.hasSparseSemantic();
     }
     return false;
   }
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
index 1d6eb24156e334..86a2b3c21faf0d 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
@@ -66,7 +66,25 @@ def InterpreterPass : Pass<"transform-interpreter"> {
   let description = [{
     This pass runs the transform dialect interpreter and applies the named
     sequence transformation specified by the provided name (defaults to
-    `TransformDialect::kTransformEntryPointSymbolName` (i.e. `__transform_main`)).
+    `TransformDialect::kTransformEntryPointSymbolName`,
+    i.e. `__transform_main`).
+
+    Additional options can be used to narrow down the pass applicability for
+    debugging purposes:
+      * `debugPayloadRootTag` makes the transform script apply to the payload
+        operation that has a `transform.target_tag` string attribute with the
+        given value, rather than to the anchor operation of the pass.
+      * `debugBindTrailingArgs` allows one to bind values to trailing arguments
+        of the transform entry point as follows:
+        * arguments of `TransformHandleTypeInterface` type can be bound to all
+          payload operations with the name provided as a simple string;
+        * arguments of `TransformValueHandleTypeInterface` type can be bound to
+          a flattened list of results of all operations with the name provided
+          as a string prefixed with `^`;
+        * arguments of `TransformParamTypeInterface` type can be bound to
+          integer constants provided as `;`-separated list prefixed with `#`.
+      * `entryPoint` specifies the name of the transform symbol to serve as the
+        entry point.
   }];
   let dependentDialects = ["::mlir::transform::TransformDialect"];
   let options = [
@@ -83,7 +101,9 @@ def InterpreterPass : Pass<"transform-interpreter"> {
            "false",
            "Disable expensive checks in the interpreter for a faster run.">,
     Option<"entryPoint", "entry-point", "std::string",
-           /*default=*/[{TransformDialect::kTransformEntryPointSymbolName.str()}],
+           /*default=*/[{
+              TransformDialect::kTransformEntryPointSymbolName.str()
+            }],
            "Entry point of the pass pipeline.">,
   ];
 }
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 051dd0e62cc532..989b77c5383075 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -589,7 +589,7 @@ class RewriterBase : public OpBuilder {
 
   /// Split the operations starting at "before" (inclusive) out of the given
   /// block into a new block, and return it.
-  virtual Block *splitBlock(Block *block, Block::iterator before);
+  Block *splitBlock(Block *block, Block::iterator before);
 
   /// Unlink this operation from its current block and insert it right before
   /// `existingOp` which may be in the same or another block in the same
diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index fff3b87faff669..a74d0faa1dfc4b 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -90,7 +90,7 @@ class alignas(8) ValueImpl : public IRObjectWithUseList<OpOperand> {
 /// class has value-type semantics and is just a simple wrapper around a
 /// ValueImpl that is either owner by a block(in the case of a BlockArgument) or
 /// an Operation(in the case of an OpResult).
-/// As most IR construct, this isn't const-correct, but we keep method
+/// As most IR constructs, this isn't const-correct, but we keep method
 /// consistent and as such method that immediately modify this Value aren't
 /// marked `const` (include modifying the Value use-list).
 class Value {
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index e28921619fe582..5d90c197a6cced 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_INITALLPASSES_H_
 #define MLIR_INITALLPASSES_H_
 
+#include "mlir/Config/mlir-config.h"
 #include "mlir/Conversion/Passes.h"
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
 #include "mlir/Dialect/Affine/Passes.h"
@@ -96,7 +97,7 @@ inline void registerAllPasses() {
   bufferization::registerBufferizationPipelines();
   sparse_tensor::registerSparseTensorPipelines();
   tosa::registerTosaToLinalgPipelines();
-#if MLIR_CUDA_CONVERSIONS_ENABLED
+#if MLIR_ENABLE_CUDA_CONVERSIONS
   gpu::registerGPUToNVVMPipeline();
 #endif
 }
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 7e8e67a9d17824..84396529eb7c2e 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -741,9 +741,6 @@ class ConversionPatternRewriter final : public PatternRewriter {
   /// implemented for dialect conversion.
   void eraseBlock(Block *block) override;
 
-  /// PatternRewriter hook for splitting a block into two parts.
-  Block *splitBlock(Block *block, Block::iterator before) override;
-
   /// PatternRewriter hook for inlining the ops of a block into another block.
   void inlineBlockBefore(Block *source, Block *dest, Block::iterator before,
                          ValueRange argValues = std::nullopt) override;
diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
index e69f9c837ca1d6..10ccd5c97783bf 100644
--- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Transforms/Transforms.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -558,6 +559,7 @@ class LowerAffinePass
     RewritePatternSet patterns(&getContext());
     populateAffineToStdConversionPatterns(patterns);
     populateAffineToVectorConversionPatterns(patterns);
+    populateAffineExpandIndexOpsPatterns(patterns);
     ConversionTarget target(getContext());
     target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
                            scf::SCFDialect, VectorDialect>();
diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
index 2ba0f30b1190ee..f41e3ca27ee4dd 100644
--- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
+++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt
@@ -12,12 +12,13 @@ add_mlir_conversion_library(MLIRAffineToStandard
 
   LINK_LIBS PUBLIC
   MLIRAffineDialect
+  MLIRAffineTransforms
   MLIRAffineUtils
   MLIRArithDialect
   MLIRIR
   MLIRMemRefDialect
-  MLIRSCFDialect
   MLIRPass
+  MLIRSCFDialect
   MLIRTransforms
   MLIRVectorDialect
   )
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 19cc914efae005..337f8bb6ab99ed 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1532,7 +1532,8 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
     auto punct = printOp.getPunctuation();
     if (auto stringLiteral = printOp.getStringLiteral()) {
       LLVM::createPrintStrCall(rewriter, loc, parent, "vector_print_str",
-                               *stringLiteral, *getTypeConverter());
+                               *stringLiteral, *getTypeConverter(),
+                               /*addNewline=*/false);
     } else if (punct != PrintPunctuation::NoPunctuation) {
       emitCall(rewriter, printOp->getLoc(), [&] {
         switch (punct) {
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
index ff29f9f6938535..b7e17a92897389 100644
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
@@ -27,7 +27,8 @@ DiagnosedSilenceableFailure
 ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
     TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results,
     TransformState &state) {
-  optimizeSharedMemoryReadsAndWritesOp(funcOp);
+  optimizeSharedMemoryReadsAndWritesOp(funcOp, getSharedMemoryLineSizeBytes(),
+                                       getDefaultVectorSizeBits());
   return DiagnosedSilenceableFailure::success();
 }
 
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
index 6bd03ed833898d..32fab265e03cc0 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
@@ -35,13 +35,6 @@ namespace amdgpu {
 using namespace mlir;
 using namespace mlir::amdgpu;
 
-/// The size of a shared memory line according to AMD documentation.
-/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf
-constexpr int64_t kSharedMemoryLineSizeBytes = 64;
-/// We optimize for 64bit accesses, but this can be made an argument in the
-/// future.
-constexpr int64_t kDefaultVectorSizeBits = 64;
-
 /// Uses `srcIndexValue` to permute `tgtIndexValue` via
 /// `result = xor(floordiv(srcIdxVal,permuteEveryN),
 ///               floordiv(tgtIdxVal,vectorSize)))
@@ -49,7 +42,9 @@ constexpr int64_t kDefaultVectorSizeBits = 64;
 /// This is done using an optimized sequence of `arith` operations.
 static Value permuteVectorOffset(OpBuilder &b, Location loc,
                                  ArrayRef<Value> indices, MemRefType memrefTy,
-                                 int64_t srcDim, int64_t tgtDim) {
+                                 int64_t srcDim, int64_t tgtDim,
+                                 int64_t sharedMemoryLineSizeBytes,
+                                 int64_t defaultVectorSizeBits) {
   // Adjust the src index to change how often the permutation changes
   // if necessary.
   Value src = indices[srcDim];
@@ -57,9 +52,9 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
   // We only want to permute every N iterations of the target dim where N is
   // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
   const int64_t permuteEveryN = std::max<int64_t>(
-      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
-                                        memrefTy.getElementTypeBitWidth()) /
-                                       8));
+      1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
+                                       memrefTy.getElementTypeBitWidth()) /
+                                      8));
 
   // clang-format off
   // Index bit representation (b0 = least significant bit) for dim(1)
@@ -71,7 +66,7 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
   // bits[N:M] = vector index
   // clang-format on
   int64_t n =
-      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
+      llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
   int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
 
   // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
@@ -105,9 +100,11 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
 static void transformIndices(OpBuilder &builder, Location loc,
                              SmallVector<Value, 4> &indices,
                              MemRefType memrefTy, int64_t srcDim,
-                             int64_t tgtDim) {
+                             int64_t tgtDim, int64_t sharedMemoryLineSizeBytes,
+                             int64_t defaultVectorSizeBits) {
   indices[tgtDim] =
-      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
+      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim,
+                          sharedMemoryLineSizeBytes, defaultVectorSizeBits);
 }
 
 // Return all operations within `parentOp` that read from or write to
@@ -149,8 +146,9 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
   return success();
 }
 
-LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
-                                                         Value memrefValue) {
+LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(
+    Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes,
+    int64_t defaultVectorSizeBits) {
   auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
   if (!memRefType ||
       !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
@@ -167,10 +165,10 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
   const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
   const int64_t rowsPerLine =
-      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
+      (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
       rowSize;
   const int64_t threadGroupSize =
-      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
+      1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8));
   if (rowsPerLine >= threadGroupSize)
     return failure();
 
@@ -198,7 +196,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmWriteOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmWriteOp, transformedIndices);
   }
 
@@ -210,7 +209,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmReadOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmReadOp, transformedIndices);
   }
 
@@ -218,7 +218,9 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
 }
 
 std::optional<LogicalResult>
-amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
+amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+                                             int64_t sharedMemoryLineSizeBytes,
+                                             int64_t defaultVectorSizeBits) {
   SmallVector<memref::AllocOp> shmAllocOps;
   funcOp.walk([&](memref::AllocOp allocOp) {
     if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
@@ -226,8 +228,9 @@ amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
     shmAllocOps.push_back(allocOp);
   });
   for (auto allocOp : shmAllocOps) {
-    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
-                                                          allocOp.getMemref())))
+    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(
+            funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes,
+            defaultVectorSizeBits)))
       return failure();
   }
   return success();
@@ -237,7 +240,8 @@ struct OptimizeSharedMemoryPass
     : public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
 public:
   OptimizeSharedMemoryPass() = default;
-
+  OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options)
+      : OptimizeSharedMemoryBase(options) {}
   void runOnOperation() override {
     Operation *op = getOperation();
     SmallVector<memref::AllocOp> shmAllocOps;
@@ -248,8 +252,9 @@ struct OptimizeSharedMemoryPass
       shmAllocOps.push_back(allocOp);
     });
     for (auto allocOp : shmAllocOps) {
-      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
-                                                    allocOp.getMemref())))
+      if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(),
+                                                    sharedMemoryLineSizeBytes,
+                                                    defaultVectorSizeBits)))
         return;
     }
   }
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
index d0ae2b64ffa9e0..9139815ddce04f 100644
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -278,68 +278,62 @@ struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
       return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16.");
     }
 
-    Type i1Ty = b.getI1Type();
     Type i16Ty = b.getI16Type();
     Type i32Ty = b.getI32Type();
     Type f32Ty = b.getF32Type();
     if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
-      i1Ty = shapedTy.clone(i1Ty);
       i16Ty = shapedTy.clone(i16Ty);
       i32Ty = shapedTy.clone(i32Ty);
       f32Ty = shapedTy.clone(f32Ty);
     }
 
-    Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);
-
-    Value c23 = createConst(op.getLoc(), i32Ty, 23, rewriter);
-    Value c31 = createConst(op.getLoc(), i32Ty, 31, rewriter);
-    Value c23Mask = createConst(op.getLoc(), i32Ty, (1 << 23) - 1, rewriter);
-    Value expMask =
-        createConst(op.getLoc(), i32Ty, ((1 << 8) - 1) << 23, rewriter);
-    Value expMax =
-        createConst(op.getLoc(), i32Ty, ((1 << 8) - 2) << 23, rewriter);
-
-    // Grab the sign bit.
-    Value sign = b.create<arith::ShRUIOp>(bitcast, c31);
-
-    // Our mantissa rounding value depends on the sign bit and the last
-    // truncated bit.
-    Value cManRound = createConst(op.getLoc(), i32Ty, (1 << 15), rewriter);
-    cManRound = b.create<arith::SubIOp>(cManRound, sign);
-
-    // Grab out the mantissa and directly apply rounding.
-    Value man = b.create<arith::AndIOp>(bitcast, c23Mask);
-    Value manRound = b.create<arith::AddIOp>(man, cManRound);
-
-    // Grab the overflow bit and shift right if we overflow.
-    Value roundBit = b.create<arith::ShRUIOp>(manRound, c23);
-    Value manNew = b.create<arith::ShRUIOp>(manRound, roundBit);
-
-    // Grab the exponent and round using the mantissa's carry bit.
-    Value exp = b.create<arith::AndIOp>(bitcast, expMask);
-    Value expCarry = b.create<arith::AddIOp>(exp, manRound);
-    expCarry = b.create<arith::AndIOp>(expCarry, expMask);
-
-    // If the exponent is saturated, we keep the max value.
-    Value expCmp =
-        b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, exp, expMax);
-    exp = b.create<arith::SelectOp>(expCmp, exp, expCarry);
-
-    // If the exponent is max and we rolled over, keep the old mantissa.
-    Value roundBitBool = b.create<arith::TruncIOp>(i1Ty, roundBit);
-    Value keepOldMan = b.create<arith::AndIOp>(expCmp, roundBitBool);
-    man = b.create<arith::SelectOp>(keepOldMan, man, manNew);
-
-    // Assemble the now rounded f32 value (as an i32).
-    Value rounded = b.create<arith::ShLIOp>(sign, c31);
-    rounded = b.create<arith::OrIOp>(rounded, exp);
-    rounded = b.create<arith::OrIOp>(rounded, man);
-
+    // Algorithm borrowed from this excellent code:
+    // https://github.com/pytorch/pytorch/blob/e1502c0cdbfd17548c612f25d5a65b1e4b86224d/c10/util/BFloat16.h#L60-L79
+    // There is a magic idea there, to let the addition of the rounding_bias to
+    // the mantissa simply overflow into the exponent bits. It's a bit of an
+    // aggressive, obfuscating optimization, but it is well-tested code, and it
+    // results in more concise and efficient IR.
+    // The case of NaN is handled separately (see isNaN and the final select).
+    // The case of infinities is NOT handled separately, which deserves an
+    // explanation. As the encoding of infinities has zero mantissa, the
+    // rounding-bias addition never carries into the exponent so that just gets
+    // truncated away, and as bfloat16 and float32 have the same number of
+    // exponent bits, that simple truncation is the desired outcome for
+    // infinities.
+    Value isNan =
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::UNE, operand, operand);
+    // Constant used to make the rounding bias.
+    Value c7FFF = createConst(op.getLoc(), i32Ty, 0x7fff, rewriter);
+    // Constant used to generate a quiet NaN.
+    Value c7FC0_i16 = createConst(op.getLoc(), i16Ty, 0x7fc0, rewriter);
+    // Small constants used to address bits.
     Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
-    Value shr = b.create<arith::ShRUIOp>(rounded, c16);
-    Value trunc = b.create<arith::TruncIOp>(i16Ty, shr);
-    Value result = b.create<arith::BitcastOp>(resultTy, trunc);
-
+    Value c1 = createConst(op.getLoc(), i32Ty, 1, rewriter);
+    // Reinterpret the input f32 value as bits.
+    Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);
+    // Read bit 16 as a value in {0,1}.
+    Value bit16 =
+        b.create<arith::AndIOp>(b.create<arith::ShRUIOp>(bitcast, c16), c1);
+    // Determine the rounding bias to add as either 0x7fff or 0x8000 depending
+    // on bit 16, implementing the tie-breaking "to nearest even".
+    Value roundingBias = b.create<arith::AddIOp>(bit16, c7FFF);
+    // Add the rounding bias. Generally we want this to be added to the
+    // mantissa, but nothing prevents this to from carrying into the exponent
+    // bits, which would feel like a bug, but this is the magic trick here:
+    // when that happens, the mantissa gets reset to zero and the exponent
+    // gets incremented by the carry... which is actually exactly what we
+    // want.
+    Value biased = b.create<arith::AddIOp>(bitcast, roundingBias);
+    // Now that the rounding-bias has been added, truncating the low bits
+    // yields the correctly rounded result.
+    Value biasedAndShifted = b.create<arith::ShRUIOp>(biased, c16);
+    Value normalCaseResult_i16 =
+        b.create<arith::TruncIOp>(i16Ty, biasedAndShifted);
+    // Select either the above-computed result, or a quiet NaN constant
+    // if the input was NaN.
+    Value select =
+        b.create<arith::SelectOp>(isNan, c7FC0_i16, normalCaseResult_i16);
+    Value result = b.create<arith::BitcastOp>(resultTy, select);
     rewriter.replaceOp(op, result);
     return success();
   }
diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
index 935f0deaf9c8a6..db1974ddb3773b 100644
--- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
+++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Config/mlir-config.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
@@ -38,7 +39,7 @@
 
 using namespace mlir;
 
-#if MLIR_CUDA_CONVERSIONS_ENABLED
+#if MLIR_ENABLE_CUDA_CONVERSIONS
 namespace {
 
 //===----------------------------------------------------------------------===//
@@ -127,4 +128,4 @@ void mlir::gpu::registerGPUToNVVMPipeline() {
       buildLowerToNVVMPassPipeline);
 }
 
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index 0527073da85b69..f379ea8193923d 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -13,6 +13,7 @@
 
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 
+#include "mlir/Config/mlir-config.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
@@ -48,7 +49,7 @@ void GpuModuleToBinaryPass::getDependentDialects(
   // Register all GPU related translations.
   registry.insert<gpu::GPUDialect>();
   registry.insert<LLVM::LLVMDialect>();
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
   registry.insert<NVVM::NVVMDialect>();
 #endif
 #if MLIR_ROCM_CONVERSIONS_ENABLED == 1
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 299965bcfc3ab3..ef9cd5561665fc 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -3244,6 +3244,31 @@ DiagnosedSilenceableFailure transform::ConvertConv2DToImg2ColOp::applyToOne(
   return DiagnosedSilenceableFailure::success();
 }
 
+//===----------------------------------------------------------------------===//
+// FlattenElementwiseLinalgOp.
+//===----------------------------------------------------------------------===//
+
+DiagnosedSilenceableFailure transform::FlattenElementwiseLinalgOp::applyToOne(
+    transform::TransformRewriter &rewriter, linalg::LinalgOp target,
+    transform::ApplyToEachResultList &results,
+    transform::TransformState &state) {
+  rewriter.setInsertionPoint(target);
+  if (target.getNumLoops() <= 1)
+    return DiagnosedSilenceableFailure::success();
+  ReassociationIndices reassociation(target.getNumLoops());
+  std::iota(reassociation.begin(), reassociation.end(), 0);
+  auto maybeFlattened =
+      (isElementwise(target))
+          ? collapseOpIterationDims(target, reassociation, rewriter)
+          : FailureOr<CollapseResult>(rewriter.notifyMatchFailure(
+                target, "only elementwise flattening is supported"));
+  if (failed(maybeFlattened))
+    return emitDefaultSilenceableFailure(target);
+  results.push_back(maybeFlattened->collapsedOp);
+  rewriter.replaceOp(target, maybeFlattened->results);
+  return DiagnosedSilenceableFailure::success();
+}
+
 //===----------------------------------------------------------------------===//
 // TransposeConv2DOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
index 4977940cfbd797..4797bfb2267d7f 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp
@@ -1446,24 +1446,20 @@ void generateCollapsedIndexingRegion(Location loc, Block *block,
   }
 }
 
-template <typename LinalgType>
-Operation *createCollapsedOp(LinalgType op,
-                             const CollapsingInfo &collapsingInfo,
-                             RewriterBase &rewriter) {
-  static_assert(llvm::is_one_of<LinalgType, GenericOp, CopyOp>::value,
-                "unsupported linalg op type to create");
+void collapseOperandsAndResults(LinalgOp op,
+                                const CollapsingInfo &collapsingInfo,
+                                RewriterBase &rewriter,
+                                SmallVectorImpl<Value> &inputOperands,
+                                SmallVectorImpl<Value> &outputOperands,
+                                SmallVectorImpl<Type> &resultTypes) {
   Location loc = op->getLoc();
-
-  // Get the input operands.
-  SmallVector<Value> inputOperands =
+  inputOperands =
       llvm::map_to_vector(op.getDpsInputOperands(), [&](OpOperand *opOperand) {
         return getCollapsedOpOperand(loc, op, opOperand, collapsingInfo,
                                      rewriter);
       });
 
   // Get the output operands and result types.
-  SmallVector<Type> resultTypes;
-  SmallVector<Value> outputOperands;
   resultTypes.reserve(op.getNumDpsInits());
   outputOperands.reserve(op.getNumDpsInits());
   for (OpOperand &output : op.getDpsInitsMutable()) {
@@ -1475,41 +1471,69 @@ Operation *createCollapsedOp(LinalgType op,
     if (!op.hasPureBufferSemantics())
       resultTypes.push_back(newOutput.getType());
   }
+}
 
-  if (isa<linalg::CopyOp>(op)) {
-    return rewriter.create<linalg::CopyOp>(loc, inputOperands[0],
-                                           outputOperands[0]);
-  }
+/// Clone a `LinalgOp` to a collapsed version of same name
+template <typename OpTy>
+OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp,
+                        const CollapsingInfo &collapsingInfo) {
+  return nullptr;
+}
 
-  // Get the iterator types for the operand.
-  SmallVector<utils::IteratorType> iteratorTypes =
-      getCollapsedOpIteratorTypes(op.getIteratorTypesArray(), collapsingInfo);
+/// Collapse any `LinalgOp` that does not require any specialization such as
+/// indexing_maps, iterator_types, etc.
+template <>
+LinalgOp cloneToCollapsedOp<LinalgOp>(RewriterBase &rewriter, LinalgOp origOp,
+                                      const CollapsingInfo &collapsingInfo) {
+  SmallVector<Value> inputOperands, outputOperands;
+  SmallVector<Type> resultTypes;
+  collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands,
+                             outputOperands, resultTypes);
+  return cast<LinalgOp>(clone(
+      rewriter, origOp, resultTypes,
+      llvm::to_vector(llvm::concat<Value>(inputOperands, outputOperands))));
+}
 
-  // Get the indexing maps.
-  auto indexingMaps =
-      llvm::map_to_vector(op.getIndexingMapsArray(), [&](AffineMap map) {
+/// Collapse a `GenericOp`
+template <>
+GenericOp cloneToCollapsedOp<GenericOp>(RewriterBase &rewriter,
+                                        GenericOp origOp,
+                                        const CollapsingInfo &collapsingInfo) {
+  SmallVector<Value> inputOperands, outputOperands;
+  SmallVector<Type> resultTypes;
+  collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands,
+                             outputOperands, resultTypes);
+  SmallVector<AffineMap> indexingMaps(
+      llvm::map_range(origOp.getIndexingMapsArray(), [&](AffineMap map) {
         return getCollapsedOpIndexingMap(map, collapsingInfo);
-      });
+      }));
+
+  SmallVector<utils::IteratorType> iteratorTypes(getCollapsedOpIteratorTypes(
+      origOp.getIteratorTypesArray(), collapsingInfo));
 
-  Operation *collapsedOp = rewriter.create<linalg::GenericOp>(
-      loc, resultTypes, inputOperands, outputOperands, indexingMaps,
+  GenericOp collapsedOp = rewriter.create<linalg::GenericOp>(
+      origOp.getLoc(), resultTypes, inputOperands, outputOperands, indexingMaps,
       iteratorTypes, [](OpBuilder &builder, Location loc, ValueRange args) {});
-  Block *origOpBlock = &op->getRegion(0).front();
+  Block *origOpBlock = &origOp->getRegion(0).front();
   Block *collapsedOpBlock = &collapsedOp->getRegion(0).front();
   rewriter.mergeBlocks(origOpBlock, collapsedOpBlock,
                        collapsedOpBlock->getArguments());
-
   return collapsedOp;
 }
 
+LinalgOp createCollapsedOp(LinalgOp op, const CollapsingInfo &collapsingInfo,
+                           RewriterBase &rewriter) {
+  if (GenericOp genericOp = dyn_cast<GenericOp>(op.getOperation())) {
+    return cloneToCollapsedOp(rewriter, genericOp, collapsingInfo);
+  } else {
+    return cloneToCollapsedOp(rewriter, op, collapsingInfo);
+  }
+}
+
 /// Implementation of fusion with reshape operation by collapsing dimensions.
-template <typename LinalgType>
-FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
-    LinalgType op, ArrayRef<ReassociationIndices> foldedIterationDims,
+FailureOr<CollapseResult> mlir::linalg::collapseOpIterationDims(
+    LinalgOp op, ArrayRef<ReassociationIndices> foldedIterationDims,
     RewriterBase &rewriter) {
-  static_assert(llvm::is_one_of<LinalgType, GenericOp, CopyOp>::value,
-                "unsupported linalg op type to collapse");
-
   // Bail on trivial no-op cases.
   if (op.getNumLoops() <= 1 || foldedIterationDims.empty() ||
       llvm::all_of(foldedIterationDims, [](ReassociationIndicesRef foldedDims) {
@@ -1538,8 +1562,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
   }
 
   // Bail on non-canonical ranges.
-  SmallVector<Range> loopRanges =
-      cast<LinalgOp>(op.getOperation()).createLoopRanges(rewriter, op.getLoc());
+  SmallVector<Range> loopRanges = op.createLoopRanges(rewriter, op.getLoc());
   auto opFoldIsConstantValue = [](OpFoldResult ofr, int64_t value) {
     if (auto attr = llvm::dyn_cast_if_present<Attribute>(ofr))
       return cast<IntegerAttr>(attr).getInt() == value;
@@ -1555,8 +1578,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
         op, "expected all loop ranges to have zero start and unit stride");
   }
 
-  LinalgType collapsedOp = cast<LinalgType>(
-      createCollapsedOp<LinalgType>(op, collapsingInfo, rewriter));
+  LinalgOp collapsedOp = createCollapsedOp(op, collapsingInfo, rewriter);
 
   Location loc = op->getLoc();
   if (collapsedOp.hasIndexSemantics()) {
@@ -1597,7 +1619,7 @@ FailureOr<SmallVector<Value>> mlir::linalg::collapseOpIterationDims(
       results.push_back(collapsedOpResult);
     }
   }
-  return results;
+  return CollapseResult{results, collapsedOp};
 }
 
 namespace {
@@ -1629,15 +1651,14 @@ class FoldWithProducerReshapeOpByCollapsing
         continue;
       }
 
-      std::optional<SmallVector<Value>> replacements =
-          collapseOpIterationDims<linalg::GenericOp>(
-              genericOp, collapsableIterationDims, rewriter);
-      if (!replacements) {
+      std::optional<CollapseResult> collapseResult = collapseOpIterationDims(
+          genericOp, collapsableIterationDims, rewriter);
+      if (!collapseResult) {
         return rewriter.notifyMatchFailure(
             genericOp, "failed to do the fusion by collapsing transformation");
       }
 
-      rewriter.replaceOp(genericOp, *replacements);
+      rewriter.replaceOp(genericOp, collapseResult->results);
       return success();
     }
     return failure();
@@ -1671,13 +1692,12 @@ class CollapseLinalgDimensions : public OpRewritePattern<LinalgType> {
           op, "specified dimensions cannot be collapsed");
     }
 
-    std::optional<SmallVector<Value>> replacements =
-        collapseOpIterationDims<LinalgType>(op, collapsableIterationDims,
-                                            rewriter);
-    if (!replacements) {
+    std::optional<CollapseResult> collapseResult =
+        collapseOpIterationDims(op, collapsableIterationDims, rewriter);
+    if (!collapseResult) {
       return rewriter.notifyMatchFailure(op, "failed to collapse dimensions");
     }
-    rewriter.replaceOp(op, *replacements);
+    rewriter.replaceOp(op, collapseResult->results);
     return success();
   }
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index ac043e87223dfe..1e703dacfd0c75 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -891,8 +891,7 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val,
 
   // Conservatively reject Ops that could lead to indices with stride other
   // than 1.
-  if (!isa<arith::AddIOp, arith::SubIOp, arith::ConstantOp, linalg::IndexOp>(
-          ancestor))
+  if (!isa<arith::AddIOp, arith::ConstantOp, linalg::IndexOp>(ancestor))
     return false;
 
   bool result = false;
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c2b471ab96183f..8a6980e2c6a2d9 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1957,7 +1957,10 @@ LogicalResult PrivateClauseOp::verify() {
   Type symType = getType();
 
   auto verifyTerminator = [&](Operation *terminator) -> LogicalResult {
-    if (!terminator->hasSuccessors() && !llvm::isa<YieldOp>(terminator))
+    if (!terminator->getBlock()->getSuccessors().empty())
+      return success();
+
+    if (!llvm::isa<YieldOp>(terminator))
       return mlir::emitError(terminator->getLoc())
              << "expected exit block terminator to be an `omp.yield` op.";
 
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 731cd79a1e3b4b..72b722c69ae34b 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -476,7 +476,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
     // Starts resetting from a dense level, so that the first bit (if kept)
     // is not undefined level-type.
     for (unsigned b = 0; b < be; b++) {
-      if (simple[b] && isDenseLT(getLvlType(TensorLoopId{b}))) {
+      if (simple[b] && getLvlType(TensorLoopId{b}).hasDenseSemantic()) {
         offset = be - b - 1; // relative to the end
         break;
       }
@@ -489,8 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
     // Slice on dense level has `locate` property as well, and can be optimized.
     if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
       const auto lt = getLvlType(b);
-      if (!isCompressedLT(lt) && !isSingletonLT(lt) &&
-          !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) {
+      if (!lt.hasSparseSemantic()) {
         if (reset)
           simple.reset(b);
         reset = true;
@@ -670,8 +669,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
 bool Merger::hasAnySparse(const BitVector &bits) const {
   for (TensorLoopId b : bits.set_bits()) {
     const auto lt = getLvlType(b);
-    if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) ||
-        isNOutOfMLT(lt))
+    if (lt.hasSparseSemantic())
       return true;
   }
   return hasSparseIdxReduction(bits);
diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
index 5073234a7e35e9..7adf223f3440a5 100644
--- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
@@ -50,12 +50,79 @@ static Operation *findPayloadRoot(Operation *passRoot, StringRef tag) {
     return WalkResult::interrupt();
   });
 
+  if (!target) {
+    passRoot->emitError()
+        << "could not find the operation with transform.target_tag=\"" << tag
+        << "\" attribute";
+    return nullptr;
+  }
+
   return walkResult.wasInterrupted() ? nullptr : target;
 }
 
 namespace {
 class InterpreterPass
     : public transform::impl::InterpreterPassBase<InterpreterPass> {
+  // Parses the pass arguments to bind trailing arguments of the entry point.
+  std::optional<RaggedArray<transform::MappedValue>>
+  parseArguments(Operation *payloadRoot) {
+    MLIRContext *context = payloadRoot->getContext();
+
+    SmallVector<SmallVector<transform::MappedValue>, 2> trailingBindings;
+    trailingBindings.resize(debugBindTrailingArgs.size());
+
+    // Construct lists of op names to match.
+    SmallVector<std::optional<OperationName>> debugBindNames;
+    debugBindNames.reserve(debugBindTrailingArgs.size());
+    for (auto &&[position, nameString] :
+         llvm::enumerate(debugBindTrailingArgs)) {
+      StringRef name = nameString;
+
+      // Parse the integer literals.
+      if (name.starts_with("#")) {
+        debugBindNames.push_back(std::nullopt);
+        StringRef lhs = "";
+        StringRef rhs = name.drop_front();
+        do {
+          std::tie(lhs, rhs) = rhs.split(';');
+          int64_t value;
+          if (lhs.getAsInteger(10, value)) {
+            emitError(UnknownLoc::get(context))
+                << "couldn't parse integer pass argument " << name;
+            return std::nullopt;
+          }
+          trailingBindings[position].push_back(
+              Builder(context).getI64IntegerAttr(value));
+        } while (!rhs.empty());
+      } else if (name.starts_with("^")) {
+        debugBindNames.emplace_back(OperationName(name.drop_front(), context));
+      } else {
+        debugBindNames.emplace_back(OperationName(name, context));
+      }
+    }
+
+    // Collect operations or results for extra bindings.
+    payloadRoot->walk([&](Operation *payload) {
+      for (auto &&[position, name] : llvm::enumerate(debugBindNames)) {
+        if (!name || payload->getName() != *name)
+          continue;
+
+        if (StringRef(*std::next(debugBindTrailingArgs.begin(), position))
+                .starts_with("^")) {
+          llvm::append_range(trailingBindings[position], payload->getResults());
+        } else {
+          trailingBindings[position].push_back(payload);
+        }
+      }
+    });
+
+    RaggedArray<transform::MappedValue> bindings;
+    bindings.push_back(ArrayRef<Operation *>{payloadRoot});
+    for (SmallVector<transform::MappedValue> &trailing : trailingBindings)
+      bindings.push_back(std::move(trailing));
+    return bindings;
+  }
+
 public:
   using Base::Base;
 
@@ -67,34 +134,18 @@ class InterpreterPass
         findPayloadRoot(getOperation(), debugPayloadRootTag);
     if (!payloadRoot)
       return signalPassFailure();
-    auto debugBindNames = llvm::map_to_vector(
-        debugBindTrailingArgs,
-        [&](const std::string &name) { return OperationName(name, context); });
-    SmallVector<SmallVector<Operation *>, 2> trailingBindings;
-    trailingBindings.resize(debugBindNames.size());
-    payloadRoot->walk([&](Operation *payload) {
-      for (auto &&[position, name] : llvm::enumerate(debugBindNames)) {
-        if (payload->getName() == name)
-          trailingBindings[position].push_back(payload);
-      }
-    });
 
     Operation *transformEntryPoint = transform::detail::findTransformEntryPoint(
         getOperation(), transformModule, entryPoint);
-    if (!transformEntryPoint) {
-      getOperation()->emitError()
-          << "could not find transform entry point: " << entryPoint
-          << " in either payload or transform module";
+    if (!transformEntryPoint)
       return signalPassFailure();
-    }
-
-    RaggedArray<transform::MappedValue> bindings;
-    bindings.push_back(ArrayRef<Operation *>{payloadRoot});
-    for (SmallVector<Operation *> &trailing : trailingBindings)
-      bindings.push_back(std::move(trailing));
 
+    std::optional<RaggedArray<transform::MappedValue>> bindings =
+        parseArguments(payloadRoot);
+    if (!bindings)
+      return signalPassFailure();
     if (failed(transform::applyTransformNamedSequence(
-            bindings,
+            *bindings,
             cast<transform::TransformOpInterface>(transformEntryPoint),
             transformModule,
             options.enableExpensiveChecks(!disableExpensiveChecks)))) {
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 8c341a347ff662..5be6a628904cdf 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -2040,77 +2040,6 @@ class ExtractOpFromCreateMask final : public OpRewritePattern<ExtractOp> {
   }
 };
 
-// Patterns to rewrite ExtractOp(ConstantMaskOp)
-//
-// When the result of ExtractOp is a subvector of input, we can rewrite it as
-// a ConstantMaskOp with subvector ranks.
-//
-// ExtractOp(ConstantMaskOp) -> ConstantMaskOp
-//
-// When the result of ExtractOp is a scalar, we can get the scalar value
-// directly.
-//
-// ExtractOp(ConstantMaskOp) -> ConstantOp
-class ExtractOpFromConstantMask final : public OpRewritePattern<ExtractOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(ExtractOp extractOp,
-                                PatternRewriter &rewriter) const override {
-    auto constantMaskOp =
-        extractOp.getVector().getDefiningOp<vector::ConstantMaskOp>();
-    if (!constantMaskOp)
-      return failure();
-
-    // All indices must be static.
-    ArrayRef<int64_t> extractOpPos = extractOp.getStaticPosition();
-    unsigned dynamicPosCount =
-        llvm::count_if(extractOpPos, ShapedType::isDynamic);
-    // If there is any dynamic position in ExtractOp, we cannot determine the
-    // scalar value.
-    if (dynamicPosCount)
-      return failure();
-
-    ArrayRef<Attribute> maskDimSizes =
-        constantMaskOp.getMaskDimSizes().getValue();
-    Type resultTy = extractOp.getResult().getType();
-    if (resultTy.isa<mlir::VectorType>()) {
-      auto resultVectorTy = resultTy.cast<mlir::VectorType>();
-      int64_t resultRank = resultVectorTy.getRank();
-      int64_t n = maskDimSizes.size();
-      std::vector<int64_t> indices;
-      for (auto i = n - resultRank; i < n; ++i)
-        indices.push_back(cast<IntegerAttr>(maskDimSizes[i]).getInt());
-
-      rewriter.replaceOpWithNewOp<vector::ConstantMaskOp>(
-          extractOp, resultVectorTy,
-          vector::getVectorSubscriptAttr(rewriter, indices));
-
-      return success();
-    } else if (resultTy.isa<mlir::IntegerType>()) {
-      // ConstantMaskOp creates and returns a vector mask where elements of the
-      // result vector are set to ‘0’ or ‘1’, based on whether the element
-      // indices are contained within a hyper-rectangular region.
-      // We go through ExtractOp static positions to determine the position is
-      // within the hyper-rectangular region or not.
-      Type boolType = rewriter.getI1Type();
-      IntegerAttr setAttr = IntegerAttr::get(boolType, 1);
-      for (size_t i = 0, end = extractOpPos.size(); i < end; ++i) {
-        if (cast<IntegerAttr>(maskDimSizes[i]).getInt() <= extractOpPos[i]) {
-          setAttr = IntegerAttr::get(boolType, 0);
-          break;
-        }
-      }
-
-      rewriter.replaceOpWithNewOp<arith::ConstantOp>(extractOp, boolType,
-                                                     setAttr);
-      return success();
-    }
-
-    return failure();
-  }
-};
-
 // Folds extract(shape_cast(..)) into shape_cast when the total element count
 // does not change.
 LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp,
@@ -2137,8 +2066,7 @@ LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp,
 void ExtractOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                             MLIRContext *context) {
   results.add<ExtractOpSplatConstantFolder, ExtractOpNonSplatConstantFolder,
-              ExtractOpFromBroadcast, ExtractOpFromCreateMask,
-              ExtractOpFromConstantMask>(context);
+              ExtractOpFromBroadcast, ExtractOpFromCreateMask>(context);
   results.add(foldExtractFromShapeCastToShapeCast);
 }
 
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 620ceee48b196d..b3ab4a916121e3 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -443,15 +443,24 @@ static vector::TransferWriteOp cloneWriteOp(RewriterBase &rewriter,
 /// d1) and return vector<16x2x64>
 static VectorType getDistributedType(VectorType originalType, AffineMap map,
                                      int64_t warpSize) {
-  if (map.getNumResults() != 1)
-    return VectorType();
   SmallVector<int64_t> targetShape(originalType.getShape().begin(),
                                    originalType.getShape().end());
   for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
     unsigned position = map.getDimPosition(i);
-    if (targetShape[position] % warpSize != 0)
-      return VectorType();
+    if (targetShape[position] % warpSize != 0) {
+      if (warpSize % targetShape[position] != 0) {
+        return VectorType();
+      }
+      warpSize /= targetShape[position];
+      targetShape[position] = 1;
+      continue;
+    }
     targetShape[position] = targetShape[position] / warpSize;
+    warpSize = 1;
+    break;
+  }
+  if (warpSize != 1) {
+    return VectorType();
   }
   VectorType targetType =
       VectorType::get(targetShape, originalType.getElementType());
@@ -526,7 +535,30 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
     // 4. Reindex the write using the distribution map.
     auto newWarpOp =
         newWriteOp.getVector().getDefiningOp<WarpExecuteOnLane0Op>();
+
+    // Delinearize the lane id based on the way threads are divided across the
+    // vector. To get the number of threads per vector dimension, divide the
+    // sequential size by the distributed size along each dim.
     rewriter.setInsertionPoint(newWriteOp);
+    SmallVector<OpFoldResult> delinearizedIdSizes;
+    for (auto [seqSize, distSize] :
+         llvm::zip_equal(writtenVectorType.getShape(), targetType.getShape())) {
+      assert(seqSize % distSize == 0 && "Invalid distributed vector shape");
+      delinearizedIdSizes.push_back(rewriter.getIndexAttr(seqSize / distSize));
+    }
+    SmallVector<Value> delinearized;
+    if (map.getNumResults() > 1) {
+      delinearized = rewriter
+                         .create<mlir::affine::AffineDelinearizeIndexOp>(
+                             newWarpOp.getLoc(), newWarpOp.getLaneid(),
+                             delinearizedIdSizes)
+                         .getResults();
+    } else {
+      // If there is only one map result, we can elide the delinearization
+      // op and use the lane id directly.
+      delinearized.append(targetType.getRank(), newWarpOp.getLaneid());
+    }
+
     AffineMap indexMap = map.compose(newWriteOp.getPermutationMap());
     Location loc = newWriteOp.getLoc();
     SmallVector<Value> indices(newWriteOp.getIndices().begin(),
@@ -539,11 +571,11 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
         continue;
       unsigned indexPos = indexExpr.getPosition();
       unsigned vectorPos = cast<AffineDimExpr>(std::get<1>(it)).getPosition();
+      Value laneId = delinearized[vectorPos];
       auto scale =
           rewriter.getAffineConstantExpr(targetType.getDimSize(vectorPos));
       indices[indexPos] = affine::makeComposedAffineApply(
-          rewriter, loc, d0 + scale * d1,
-          {indices[indexPos], newWarpOp.getLaneid()});
+          rewriter, loc, d0 + scale * d1, {indices[indexPos], laneId});
     }
     newWriteOp.getIndicesMutable().assign(indices);
 
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index fc11ae63e718a5..dc6f126aae4c87 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -729,8 +729,8 @@ static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter,
 
   // TODO: consider relaxing this restriction in the future if we find ways
   // to really work with subbyte elements across the MLIR/LLVM boundary.
-  unsigned resultBitwidth = preconditionType.getElementTypeBitWidth();
-  if (resultBitwidth % 8 != 0)
+  unsigned bitwidth = preconditionType.getElementTypeBitWidth();
+  if (bitwidth % 8 != 0)
     return rewriter.notifyMatchFailure(op, "bitwidth is not k * 8");
 
   return success();
@@ -768,6 +768,10 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter,
       (dstElemBitwidth % srcElemBitwidth) != 0)
     return rewriter.notifyMatchFailure(op, "Not a supported aligned case");
 
+  if ((srcType.getShape().back() % 2) != 0)
+    return rewriter.notifyMatchFailure(
+        op, "Not an even number of i4 elements in trailing dim");
+
   return success();
 }
 
@@ -876,6 +880,58 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc,
   return rewriter.create<vector::InterleaveOp>(loc, low, high);
 }
 
+/// Rewrite the i8 -> i4 truncation into a sequence of shuffles and bitwise ops
+/// that take advantage of high-level information to avoid leaving LLVM to
+/// scramble with peephole optimizations.
+static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc,
+                                Value srcValue) {
+  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  assert(srcVecType.getElementType().isSignlessInteger(8) &&
+         "Expected i8 type");
+
+  // 1. De-interleave low and high i8 elements.
+  int64_t vecDimSize = srcVecType.getShape().back();
+  SmallVector<int64_t> deinterleaveLowMaskValues;
+  SmallVector<int64_t> deinterleaveHighMaskValues;
+  assert((vecDimSize % 2) == 0 && "Odd number of i4 elements");
+  deinterleaveLowMaskValues.reserve(vecDimSize / 2);
+  deinterleaveHighMaskValues.reserve(vecDimSize / 2);
+  for (int i = 0, end = vecDimSize; i < end; i += 2) {
+    deinterleaveLowMaskValues.push_back(i);
+    deinterleaveHighMaskValues.push_back(i + 1);
+  }
+
+  auto lowShuffleOp = rewriter.create<vector::ShuffleOp>(
+      loc, srcValue, srcValue,
+      rewriter.getI64ArrayAttr(deinterleaveLowMaskValues));
+  auto highShuffleOp = rewriter.create<vector::ShuffleOp>(
+      loc, srcValue, srcValue,
+      rewriter.getI64ArrayAttr(deinterleaveHighMaskValues));
+
+  // 2. Zero out the upper side of each low i8 element.
+  constexpr int8_t i8LowBitMask = 0x0F;
+  Value zeroOutMask = rewriter.create<arith::ConstantOp>(
+      loc,
+      DenseElementsAttr::get(lowShuffleOp.getResultVectorType(), i8LowBitMask));
+  Value zeroOutLow =
+      rewriter.create<arith::AndIOp>(loc, lowShuffleOp, zeroOutMask);
+
+  // 3. Move high i4 values to upper side of the byte.
+  constexpr int8_t bitsToShift = 4;
+  VectorType deinterI8VecType = highShuffleOp.getResultVectorType();
+  auto shiftValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(deinterI8VecType, bitsToShift));
+  Value shlHigh =
+      rewriter.create<arith::ShLIOp>(loc, highShuffleOp, shiftValues);
+
+  // 4. Merge high and low i4 values.
+  auto mergedHiLowOp = rewriter.create<arith::OrIOp>(loc, zeroOutLow, shlHigh);
+
+  // 5. Generate a bitcast vector<Xxi8> -> vector<2Xxi4>.
+  auto i4VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI4Type());
+  return rewriter.create<vector::BitCastOp>(loc, i4VecType, mergedHiLowOp);
+}
+
 namespace {
 /// Rewrite bitcast(trunci) to a sequence of shuffles and bitwise ops that take
 /// advantage of high-level information to avoid leaving LLVM to scramble with
@@ -1019,7 +1075,7 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
 
   LogicalResult matchAndRewrite(ConversionOpType conversionOp,
                                 PatternRewriter &rewriter) const override {
-    // Set up the BitCastRewriter and verify the preconditions.
+    // Verify the preconditions.
     Value srcValue = conversionOp.getIn();
     auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
     auto dstVecType = dyn_cast<VectorType>(conversionOp.getType());
@@ -1043,6 +1099,65 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
   }
 };
 
+/// Rewrite the i8 -> i4 part of any truncation into a sequence of shuffles and
+/// bitwise ops that take advantage of high-level information to avoid leaving
+/// LLVM to scramble with peephole optimizations.
+///
+/// For example:
+///    arith.trunci %in : vector<8xi32> to vector<8xi4>
+///      is rewriten as
+///
+///        %cst = arith.constant dense<15> : vector<4xi8>
+///        %cst_0 = arith.constant dense<4> : vector<4xi8>
+///        %0 = arith.trunci %in : vector<8xi32> to vector<8xi8>
+///        %1 = vector.shuffle %0, %0 [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+///        %2 = vector.shuffle %0, %0 [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+///        %3 = arith.andi %1, %cst : vector<4xi8>
+///        %4 = arith.shli %2, %cst_0 : vector<4xi8>
+///        %5 = arith.ori %3, %4 : vector<4xi8>
+///        %6 = vector.bitcast %5 : vector<4xi8> to vector<8xi4>
+///
+struct RewriteAlignedSubByteIntTrunc : OpRewritePattern<arith::TruncIOp> {
+  using OpRewritePattern<arith::TruncIOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::TruncIOp truncOp,
+                                PatternRewriter &rewriter) const override {
+    // Verify the preconditions.
+    Value srcValue = truncOp.getIn();
+    auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
+    auto dstVecType = dyn_cast<VectorType>(truncOp.getType());
+    if (!srcVecType || !dstVecType)
+      return failure();
+
+    // Only single dim vectors are supported until we have
+    // `vector.deinterleave`.
+    if (srcVecType.getRank() != 1)
+      return failure();
+
+    if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp)))
+      return failure();
+
+    // Check general alignment preconditions. We invert the src/dst type order
+    // to reuse the existing precondition logic.
+    if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType,
+                                             truncOp)))
+      return failure();
+
+    // Create a new iX -> i8 truncation op.
+    Location loc = truncOp.getLoc();
+    auto i8VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI8Type());
+    Value i8TruncVal =
+        rewriter.create<arith::TruncIOp>(loc, i8VecType, srcValue);
+
+    // Rewrite the i8 -> i4 truncation part.
+    Value subByteTrunc = rewriteI8ToI4Trunc(rewriter, loc, i8TruncVal);
+
+    // Finalize the rewrite.
+    rewriter.replaceOp(truncOp, subByteTrunc);
+    return success();
+  }
+};
+
 /// Rewrite a sub-byte vector transpose into a sequence of instructions that
 /// perform the transpose on wider (byte) element types.
 /// For example:
@@ -1115,8 +1230,9 @@ void vector::populateVectorNarrowTypeRewritePatterns(
   // Patterns for aligned cases. We set higher priority as they are expected to
   // generate better performance for aligned cases.
   patterns.add<RewriteAlignedSubByteIntSignedExt<arith::ExtSIOp>,
-               RewriteAlignedSubByteIntSignedExt<arith::SIToFPOp>>(
-      patterns.getContext(), benefit.getBenefit() + 1);
+               RewriteAlignedSubByteIntSignedExt<arith::SIToFPOp>,
+               RewriteAlignedSubByteIntTrunc>(patterns.getContext(),
+                                              benefit.getBenefit() + 1);
 }
 
 void vector::populateVectorTransposeNarrowTypeRewritePatterns(
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 74dd1dfaca0da8..a2d4e216633181 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -710,6 +710,76 @@ struct BubbleDownBitCastForStridedSliceExtract
   }
 };
 
+// Shuffles vector.bitcast op before vector.insert_strided_slice op.
+//
+// This transforms IR like:
+//   %0 = vector.insert %val, %dst[4] : vector<32xi4> into vector<8x32xi4>
+//   %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+// Into:
+//   %0 = vector.bitcast %val : vector<32xi4> to vector<16xi8>
+//   %1 = vector.bitcast %dst : vector<8x32xi4> to vector<8x16xi8>
+//   %2 = vector.insert %0, %1 [4] : vector<16xi8> into vector<8x16xi8>
+//
+struct BubbleUpBitCastForInsert : public OpRewritePattern<vector::BitCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::BitCastOp bitcastOp,
+                                PatternRewriter &rewriter) const override {
+    VectorType castSrcType = bitcastOp.getSourceVectorType();
+    VectorType castDstType = bitcastOp.getResultVectorType();
+
+    // 0-D and scalable vectors are not supported yet.
+    if (castSrcType.getRank() == 0 || castSrcType.isScalable() ||
+        castDstType.isScalable())
+      return failure();
+
+    int64_t castSrcLastDim = castSrcType.getShape().back();
+    int64_t castDstLastDim = castDstType.getShape().back();
+    bool isNumElemsShrink = castSrcLastDim >= castDstLastDim;
+    int64_t ratio;
+    if (isNumElemsShrink) {
+      assert(castSrcLastDim % castDstLastDim == 0);
+      ratio = castSrcLastDim / castDstLastDim;
+    } else {
+      assert(castDstLastDim % castSrcLastDim == 0);
+      ratio = castDstLastDim / castSrcLastDim;
+    }
+
+    auto insertOp = bitcastOp.getSource().getDefiningOp<vector::InsertOp>();
+    if (!insertOp)
+      return failure();
+
+    // Only vector sources are supported for now.
+    auto insertSrcType = dyn_cast<VectorType>(insertOp.getSourceType());
+    if (!insertSrcType)
+      return failure();
+
+    // Bitcast the source.
+    SmallVector<int64_t> srcDims(insertSrcType.getShape());
+    srcDims.back() =
+        isNumElemsShrink ? srcDims.back() / ratio : srcDims.back() * ratio;
+    VectorType newCastSrcType =
+        VectorType::get(srcDims, castDstType.getElementType());
+    auto newCastSrcOp = rewriter.create<vector::BitCastOp>(
+        bitcastOp.getLoc(), newCastSrcType, insertOp.getSource());
+
+    SmallVector<int64_t> dstDims(insertOp.getDestVectorType().getShape());
+    dstDims.back() =
+        isNumElemsShrink ? dstDims.back() / ratio : dstDims.back() * ratio;
+    VectorType newCastDstType =
+        VectorType::get(dstDims, castDstType.getElementType());
+
+    // Bitcast the destination.
+    auto newCastDstOp = rewriter.create<vector::BitCastOp>(
+        bitcastOp.getLoc(), newCastDstType, insertOp.getDest());
+
+    // Generate new insert.
+    rewriter.replaceOpWithNewOp<vector::InsertOp>(
+        bitcastOp, newCastSrcOp, newCastDstOp, insertOp.getMixedPosition());
+    return success();
+  }
+};
+
 // Shuffles vector.bitcast op before vector.insert_strided_slice op.
 //
 // This transforms IR like:
@@ -1782,8 +1852,8 @@ void mlir::vector::populateBubbleVectorBitCastOpPatterns(
     RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<BubbleDownVectorBitCastForExtract,
                BubbleDownBitCastForStridedSliceExtract,
-               BubbleUpBitCastForStridedSliceInsert>(patterns.getContext(),
-                                                     benefit);
+               BubbleUpBitCastForInsert, BubbleUpBitCastForStridedSliceInsert>(
+      patterns.getContext(), benefit);
 }
 
 void mlir::vector::populateBreakDownVectorBitCastOpPatterns(
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 71b15a92782ed7..d5b6645631edd6 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -13,6 +13,7 @@
 
 #include "mlir/Target/LLVM/NVVM/Target.h"
 
+#include "mlir/Config/mlir-config.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/Target/LLVM/NVVM/Utils.h"
@@ -156,7 +157,7 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
   return std::move(bcFiles);
 }
 
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
 namespace {
 class NVPTXSerializer : public SerializeGPUModuleBase {
 public:
@@ -562,7 +563,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) {
   return compileToBinary(*serializedISA);
 #endif // MLIR_NVPTXCOMPILER_ENABLED == 1
 }
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
 
 std::optional<SmallVector<char, 0>>
 NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
@@ -574,7 +575,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
     module->emitError("Module must be a GPU module.");
     return std::nullopt;
   }
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#if MLIR_ENABLE_CUDA_CONVERSIONS
   NVPTXSerializer serializer(*module, cast<NVVMTargetAttr>(attribute), options);
   serializer.init();
   return serializer.run();
@@ -582,7 +583,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
   module->emitError(
       "The `NVPTX` target was not built. Please enable it when building LLVM.");
   return std::nullopt;
-#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
+#endif // MLIR_ENABLE_CUDA_CONVERSIONS
 }
 
 Attribute
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 6e53d801a0d2f0..fd1de274da60e8 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -396,9 +396,9 @@ collectReductionDecls(T loop,
 
 /// Translates the blocks contained in the given region and appends them to at
 /// the current insertion point of `builder`. The operations of the entry block
-/// are appended to the current insertion block, which is not expected to have a
-/// terminator. If set, `continuationBlockArgs` is populated with translated
-/// values that correspond to the values omp.yield'ed from the region.
+/// are appended to the current insertion block. If set, `continuationBlockArgs`
+/// is populated with translated values that correspond to the values
+/// omp.yield'ed from the region.
 static LogicalResult inlineConvertOmpRegions(
     Region &region, StringRef blockName, llvm::IRBuilderBase &builder,
     LLVM::ModuleTranslation &moduleTranslation,
@@ -409,7 +409,14 @@ static LogicalResult inlineConvertOmpRegions(
   // Special case for single-block regions that don't create additional blocks:
   // insert operations without creating additional blocks.
   if (llvm::hasSingleElement(region)) {
+    llvm::Instruction *potentialTerminator =
+        builder.GetInsertBlock()->empty() ? nullptr
+                                          : &builder.GetInsertBlock()->back();
+
+    if (potentialTerminator && potentialTerminator->isTerminator())
+      potentialTerminator->removeFromParent();
     moduleTranslation.mapBlock(&region.front(), builder.GetInsertBlock());
+
     if (failed(moduleTranslation.convertBlock(
             region.front(), /*ignoreArguments=*/true, builder)))
       return failure();
@@ -423,6 +430,10 @@ static LogicalResult inlineConvertOmpRegions(
     // Drop the mapping that is no longer necessary so that the same region can
     // be processed multiple times.
     moduleTranslation.forgetMapping(region);
+
+    if (potentialTerminator && potentialTerminator->isTerminator())
+      potentialTerminator->insertAfter(&builder.GetInsertBlock()->back());
+
     return success();
   }
 
@@ -656,8 +667,22 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder,
                         moduleTranslation, bodyGenStatus);
   };
   auto finiCB = [&](InsertPointTy codeGenIP) {};
+
+  // Handle copyprivate
+  Operation::operand_range cpVars = singleOp.getCopyprivateVars();
+  std::optional<ArrayAttr> cpFuncs = singleOp.getCopyprivateFuncs();
+  llvm::SmallVector<llvm::Value *> llvmCPVars;
+  llvm::SmallVector<llvm::Function *> llvmCPFuncs;
+  for (size_t i = 0, e = cpVars.size(); i < e; ++i) {
+    llvmCPVars.push_back(moduleTranslation.lookupValue(cpVars[i]));
+    auto llvmFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
+        singleOp, cast<SymbolRefAttr>((*cpFuncs)[i]));
+    llvmCPFuncs.push_back(
+        moduleTranslation.lookupFunction(llvmFuncOp.getName()));
+  }
+
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createSingle(
-      ompLoc, bodyCB, finiCB, singleOp.getNowait(), /*DidIt=*/nullptr));
+      ompLoc, bodyCB, finiCB, singleOp.getNowait(), llvmCPVars, llvmCPFuncs));
   return bodyGenStatus;
 }
 
@@ -1000,11 +1025,50 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// A RAII class that on construction replaces the region arguments of the
+/// parallel op (which correspond to private variables) with the actual private
+/// variables they correspond to. This prepares the parallel op so that it
+/// matches what is expected by the OMPIRBuilder.
+///
+/// On destruction, it restores the original state of the operation so that on
+/// the MLIR side, the op is not affected by conversion to LLVM IR.
+class OmpParallelOpConversionManager {
+public:
+  OmpParallelOpConversionManager(omp::ParallelOp opInst)
+      : region(opInst.getRegion()), privateVars(opInst.getPrivateVars()),
+        privateArgBeginIdx(opInst.getNumReductionVars()),
+        privateArgEndIdx(privateArgBeginIdx + privateVars.size()) {
+    auto privateVarsIt = privateVars.begin();
+
+    for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+         ++argIdx, ++privateVarsIt)
+      mlir::replaceAllUsesInRegionWith(region.getArgument(argIdx),
+                                       *privateVarsIt, region);
+  }
+
+  ~OmpParallelOpConversionManager() {
+    auto privateVarsIt = privateVars.begin();
+
+    for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+         ++argIdx, ++privateVarsIt)
+      mlir::replaceAllUsesInRegionWith(*privateVarsIt,
+                                       region.getArgument(argIdx), region);
+  }
+
+private:
+  Region &region;
+  OperandRange privateVars;
+  unsigned privateArgBeginIdx;
+  unsigned privateArgEndIdx;
+};
+
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  OmpParallelOpConversionManager raii(opInst);
+
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
@@ -1086,12 +1150,81 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
   // TODO: Perform appropriate actions according to the data-sharing
   // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently defaults to shared.
+  // Currently shared and private are supported.
   auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
                     llvm::Value &, llvm::Value &vPtr,
                     llvm::Value *&replacementValue) -> InsertPointTy {
     replacementValue = &vPtr;
 
+    // If this is a private value, this lambda will return the corresponding
+    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+    // returned.
+    auto [privVar, privatizerClone] =
+        [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+      if (!opInst.getPrivateVars().empty()) {
+        auto privVars = opInst.getPrivateVars();
+        auto privatizers = opInst.getPrivatizers();
+
+        for (auto [privVar, privatizerAttr] :
+             llvm::zip_equal(privVars, *privatizers)) {
+          // Find the MLIR private variable corresponding to the LLVM value
+          // being privatized.
+          llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
+          if (llvmPrivVar != &vPtr)
+            continue;
+
+          SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerAttr);
+          omp::PrivateClauseOp privatizer =
+              SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+                  opInst, privSym);
+
+          // Clone the privatizer in case it is used by more than one parallel
+          // region. The privatizer is processed in-place (see below) before it
+          // gets inlined in the parallel region and therefore processing the
+          // original op is dangerous.
+          return {privVar, privatizer.clone()};
+        }
+      }
+
+      return {mlir::Value(), omp::PrivateClauseOp()};
+    }();
+
+    if (privVar) {
+      if (privatizerClone.getDataSharingType() ==
+          omp::DataSharingClauseType::FirstPrivate) {
+        privatizerClone.emitOpError(
+            "TODO: delayed privatization is not "
+            "supported for `firstprivate` clauses yet.");
+        bodyGenStatus = failure();
+        return codeGenIP;
+      }
+
+      Region &allocRegion = privatizerClone.getAllocRegion();
+
+      // Replace the privatizer block argument with mlir value being privatized.
+      // This way, the body of the privatizer will be changed from using the
+      // region/block argument to the value being privatized.
+      auto allocRegionArg = allocRegion.getArgument(0);
+      replaceAllUsesInRegionWith(allocRegionArg, privVar, allocRegion);
+
+      auto oldIP = builder.saveIP();
+      builder.restoreIP(allocaIP);
+
+      SmallVector<llvm::Value *, 1> yieldedValues;
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
+                                         moduleTranslation, &yieldedValues))) {
+        opInst.emitError("failed to inline `alloc` region of an `omp.private` "
+                         "op in the parallel region");
+        bodyGenStatus = failure();
+      } else {
+        assert(yieldedValues.size() == 1);
+        replacementValue = yieldedValues.front();
+      }
+
+      privatizerClone.erase();
+      builder.restoreIP(oldIP);
+    }
+
     return codeGenIP;
   };
 
@@ -1635,7 +1768,7 @@ getRefPtrIfDeclareTarget(mlir::Value value,
 // A small helper structure to contain data gathered
 // for map lowering and coalese it into one area and
 // avoiding extra computations such as searches in the
-// llvm module for lowered mapped varibles or checking
+// llvm module for lowered mapped variables or checking
 // if something is declare target (and retrieving the
 // value) more than neccessary.
 struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy {
@@ -2854,26 +2987,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
                                                 moduleTranslation);
               return failure();
             })
-      .Case(
-          "omp.requires",
-          [&](Attribute attr) {
-            if (auto requiresAttr = attr.dyn_cast<omp::ClauseRequiresAttr>()) {
-              using Requires = omp::ClauseRequires;
-              Requires flags = requiresAttr.getValue();
-              llvm::OpenMPIRBuilderConfig &config =
-                  moduleTranslation.getOpenMPBuilder()->Config;
-              config.setHasRequiresReverseOffload(
-                  bitEnumContainsAll(flags, Requires::reverse_offload));
-              config.setHasRequiresUnifiedAddress(
-                  bitEnumContainsAll(flags, Requires::unified_address));
-              config.setHasRequiresUnifiedSharedMemory(
-                  bitEnumContainsAll(flags, Requires::unified_shared_memory));
-              config.setHasRequiresDynamicAllocators(
-                  bitEnumContainsAll(flags, Requires::dynamic_allocators));
-              return success();
-            }
-            return failure();
-          })
+      .Case("omp.requires",
+            [&](Attribute attr) {
+              if (auto requiresAttr =
+                      attr.dyn_cast<omp::ClauseRequiresAttr>()) {
+                using Requires = omp::ClauseRequires;
+                Requires flags = requiresAttr.getValue();
+                llvm::OpenMPIRBuilderConfig &config =
+                    moduleTranslation.getOpenMPBuilder()->Config;
+                config.setHasRequiresReverseOffload(
+                    bitEnumContainsAll(flags, Requires::reverse_offload));
+                config.setHasRequiresUnifiedAddress(
+                    bitEnumContainsAll(flags, Requires::unified_address));
+                config.setHasRequiresUnifiedSharedMemory(
+                    bitEnumContainsAll(flags, Requires::unified_shared_memory));
+                config.setHasRequiresDynamicAllocators(
+                    bitEnumContainsAll(flags, Requires::dynamic_allocators));
+                return success();
+              }
+              return failure();
+            })
       .Default([](Attribute) {
         // Fall through for omp attributes that do not require lowering.
         return success();
@@ -2988,12 +3121,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
       .Case([&](omp::TargetOp) {
         return convertOmpTarget(*op, builder, moduleTranslation);
       })
-      .Case<omp::MapInfoOp, omp::DataBoundsOp>([&](auto op) {
-        // No-op, should be handled by relevant owning operations e.g.
-        // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
-        // discarded
-        return success();
-      })
+      .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+          [&](auto op) {
+            // No-op, should be handled by relevant owning operations e.g.
+            // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+            // discarded
+            return success();
+          })
       .Default([&](Operation *inst) {
         return inst->emitError("unsupported OpenMP operation: ")
                << inst->getName();
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index a11603a44dcd1c..c00628a420a000 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -1512,13 +1512,14 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
   if (!scopeInserted)
     return scopeIt->second;
   llvm::LLVMContext &ctx = llvmModule->getContext();
+  auto dummy = llvm::MDNode::getTemporary(ctx, std::nullopt);
   // Convert the domain metadata node if necessary.
   auto [domainIt, insertedDomain] = aliasDomainMetadataMapping.try_emplace(
       aliasScopeAttr.getDomain(), nullptr);
   if (insertedDomain) {
     llvm::SmallVector<llvm::Metadata *, 2> operands;
     // Placeholder for self-reference.
-    operands.push_back({});
+    operands.push_back(dummy.get());
     if (StringAttr description = aliasScopeAttr.getDomain().getDescription())
       operands.push_back(llvm::MDString::get(ctx, description));
     domainIt->second = llvm::MDNode::get(ctx, operands);
@@ -1529,7 +1530,7 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) {
   assert(domainIt->second && "Scope's domain should already be valid");
   llvm::SmallVector<llvm::Metadata *, 3> operands;
   // Placeholder for self-reference.
-  operands.push_back({});
+  operands.push_back(dummy.get());
   operands.push_back(domainIt->second);
   if (StringAttr description = aliasScopeAttr.getDescription())
     operands.push_back(llvm::MDString::get(ctx, description));
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index f967e8352bf4c8..26899301eb742e 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -192,7 +192,6 @@ class IRRewrite {
     EraseBlock,
     InlineBlock,
     MoveBlock,
-    SplitBlock,
     BlockTypeConversion,
     ReplaceBlockArg,
     // Operation rewrites
@@ -400,30 +399,6 @@ class MoveBlockRewrite : public BlockRewrite {
   Block *insertBeforeBlock;
 };
 
-/// Splitting of a block. This rewrite is immediately reflected in the IR.
-class SplitBlockRewrite : public BlockRewrite {
-public:
-  SplitBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block,
-                    Block *originalBlock)
-      : BlockRewrite(Kind::SplitBlock, rewriterImpl, block),
-        originalBlock(originalBlock) {}
-
-  static bool classof(const IRRewrite *rewrite) {
-    return rewrite->getKind() == Kind::SplitBlock;
-  }
-
-  void rollback() override {
-    // Merge back the block that was split out.
-    originalBlock->getOperations().splice(originalBlock->end(),
-                                          block->getOperations());
-    eraseBlock(block);
-  }
-
-private:
-  // The original block from which this block was split.
-  Block *originalBlock;
-};
-
 /// This structure contains the information pertaining to an argument that has
 /// been converted.
 struct ConvertedArgInfo {
@@ -798,13 +773,12 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
                             PatternRewriter &rewriter, ValueRange values,
                             SmallVectorImpl<Value> &remapped);
 
-  /// Returns true if the given operation is ignored, and does not need to be
+  /// Return "true" if the given operation is ignored, and does not need to be
   /// converted.
   bool isOpIgnored(Operation *op) const;
 
-  /// Recursively marks the nested operations under 'op' as ignored. This
-  /// removes them from being considered for legalization.
-  void markNestedOpsIgnored(Operation *op);
+  /// Return "true" if the given operation was replaced or erased.
+  bool wasOpReplaced(Operation *op) const;
 
   //===--------------------------------------------------------------------===//
   // Type Conversion
@@ -884,9 +858,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   void notifyBlockInserted(Block *block, Region *previous,
                            Region::iterator previousIt) override;
 
-  /// Notifies that a block was split.
-  void notifySplitBlock(Block *block, Block *continuation);
-
   /// Notifies that a block is being inlined into another block.
   void notifyBlockBeingInlined(Block *block, Block *srcBlock,
                                Block::iterator before);
@@ -946,18 +917,15 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener {
   /// Ordered list of block operations (creations, splits, motions).
   SmallVector<std::unique_ptr<IRRewrite>> rewrites;
 
-  /// A set of operations that should no longer be considered for legalization,
-  /// but were not directly replace/erased/etc. by a pattern. These are
-  /// generally child operations of other operations who were
-  /// replaced/erased/etc. This is not meant to be an exhaustive list of all
-  /// operations, but the minimal set that can be used to detect if a given
-  /// operation should be `ignored`. For example, we may add the operations that
-  /// define non-empty regions to the set, but not any of the others. This
-  /// simplifies the amount of memory needed as we can query if the parent
-  /// operation was ignored.
+  /// A set of operations that should no longer be considered for legalization.
+  /// E.g., ops that are recursively legal. Ops that were replaced/erased are
+  /// tracked separately.
   SetVector<Operation *> ignoredOps;
 
-  // A set of operations that were erased.
+  /// A set of operations that were replaced/erased. Such ops are not erased
+  /// immediately but only when the dialect conversion succeeds. In the mean
+  /// time, they should no longer be considered for legalization and any attempt
+  /// to modify/access them is invalid rewriter API usage.
   SetVector<Operation *> replacedOps;
 
   /// The current type converter, or nullptr if no type converter is currently
@@ -1237,24 +1205,14 @@ LogicalResult ConversionPatternRewriterImpl::remapValues(
   return success();
 }
 
-// TODO: This function is a misnomer. It does not actually check if `op` is in
-// `ignoredOps`.
 bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const {
-  // Check to see if this operation or the parent operation is ignored.
-  return ignoredOps.count(op->getParentOp()) || replacedOps.count(op);
+  // Check to see if this operation is ignored or was replaced.
+  return replacedOps.count(op) || ignoredOps.count(op);
 }
 
-void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) {
-  // Walk this operation and collect nested operations that define non-empty
-  // regions. We mark such operations as 'ignored' so that we know we don't have
-  // to convert them, or their nested ops.
-  if (op->getNumRegions() == 0)
-    return;
-  op->walk([&](Operation *op) {
-    if (llvm::any_of(op->getRegions(),
-                     [](Region &region) { return !region.empty(); }))
-      ignoredOps.insert(op);
-  });
+bool ConversionPatternRewriterImpl::wasOpReplaced(Operation *op) const {
+  // Check to see if this operation was replaced.
+  return replacedOps.count(op);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1476,6 +1434,9 @@ void ConversionPatternRewriterImpl::notifyOperationInserted(
     logger.startLine() << "** Insert  : '" << op->getName() << "'(" << op
                        << ")\n";
   });
+  assert(!wasOpReplaced(op->getParentOp()) &&
+         "attempting to insert into a block within a replaced/erased op");
+
   if (!previous.isSet()) {
     // This is a newly created op.
     appendRewrite<CreateOperationRewrite>(op);
@@ -1490,7 +1451,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted(
 void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op,
                                                      ValueRange newValues) {
   assert(newValues.size() == op->getNumResults());
-  assert(!replacedOps.contains(op) && "operation was already replaced");
+  assert(!ignoredOps.contains(op) && "operation was already replaced");
 
   // Track if any of the results changed, e.g. erased and replaced with null.
   bool resultChanged = false;
@@ -1509,10 +1470,8 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op,
   appendRewrite<ReplaceOperationRewrite>(op, currentTypeConverter,
                                          resultChanged);
 
-  // Mark this operation as recursively ignored so that we don't need to
-  // convert any nested operations.
-  replacedOps.insert(op);
-  markNestedOpsIgnored(op);
+  // Mark this operation and all nested ops as replaced.
+  op->walk([&](Operation *op) { replacedOps.insert(op); });
 }
 
 void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
@@ -1523,6 +1482,9 @@ void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) {
 
 void ConversionPatternRewriterImpl::notifyBlockInserted(
     Block *block, Region *previous, Region::iterator previousIt) {
+  assert(!wasOpReplaced(block->getParentOp()) &&
+         "attempting to insert into a region within a replaced/erased op");
+
   if (!previous) {
     // This is a newly created block.
     appendRewrite<CreateBlockRewrite>(block);
@@ -1532,11 +1494,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted(
   appendRewrite<MoveBlockRewrite>(block, previous, prevBlock);
 }
 
-void ConversionPatternRewriterImpl::notifySplitBlock(Block *block,
-                                                     Block *continuation) {
-  appendRewrite<SplitBlockRewrite>(continuation, block);
-}
-
 void ConversionPatternRewriterImpl::notifyBlockBeingInlined(
     Block *block, Block *srcBlock, Block::iterator before) {
   appendRewrite<InlineBlockRewrite>(block, srcBlock, before);
@@ -1604,6 +1561,9 @@ void ConversionPatternRewriter::eraseOp(Operation *op) {
 }
 
 void ConversionPatternRewriter::eraseBlock(Block *block) {
+  assert(!impl->wasOpReplaced(block->getParentOp()) &&
+         "attempting to erase a block within a replaced/erased op");
+
   // Mark all ops for erasure.
   for (Operation &op : *block)
     eraseOp(&op);
@@ -1619,18 +1579,27 @@ void ConversionPatternRewriter::eraseBlock(Block *block) {
 Block *ConversionPatternRewriter::applySignatureConversion(
     Region *region, TypeConverter::SignatureConversion &conversion,
     const TypeConverter *converter) {
+  assert(!impl->wasOpReplaced(region->getParentOp()) &&
+         "attempting to apply a signature conversion to a block within a "
+         "replaced/erased op");
   return impl->applySignatureConversion(region, conversion, converter);
 }
 
 FailureOr<Block *> ConversionPatternRewriter::convertRegionTypes(
     Region *region, const TypeConverter &converter,
     TypeConverter::SignatureConversion *entryConversion) {
+  assert(!impl->wasOpReplaced(region->getParentOp()) &&
+         "attempting to apply a signature conversion to a block within a "
+         "replaced/erased op");
   return impl->convertRegionTypes(region, converter, entryConversion);
 }
 
 LogicalResult ConversionPatternRewriter::convertNonEntryRegionTypes(
     Region *region, const TypeConverter &converter,
     ArrayRef<TypeConverter::SignatureConversion> blockConversions) {
+  assert(!impl->wasOpReplaced(region->getParentOp()) &&
+         "attempting to apply a signature conversion to a block within a "
+         "replaced/erased op");
   return impl->convertNonEntryRegionTypes(region, converter, blockConversions);
 }
 
@@ -1663,25 +1632,22 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys,
                            results);
 }
 
-Block *ConversionPatternRewriter::splitBlock(Block *block,
-                                             Block::iterator before) {
-  auto *continuation = block->splitBlock(before);
-  impl->notifySplitBlock(block, continuation);
-  return continuation;
-}
-
 void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
                                                   Block::iterator before,
                                                   ValueRange argValues) {
+#ifndef NDEBUG
   assert(argValues.size() == source->getNumArguments() &&
          "incorrect # of argument replacement values");
-#ifndef NDEBUG
+  assert(!impl->wasOpReplaced(source->getParentOp()) &&
+         "attempting to inline a block from a replaced/erased op");
+  assert(!impl->wasOpReplaced(dest->getParentOp()) &&
+         "attempting to inline a block into a replaced/erased op");
   auto opIgnored = [&](Operation *op) { return impl->isOpIgnored(op); };
-#endif // NDEBUG
   // The source block will be deleted, so it should not have any users (i.e.,
   // there should be no predecessors).
   assert(llvm::all_of(source->getUsers(), opIgnored) &&
          "expected 'source' to have no predecessors");
+#endif // NDEBUG
 
   impl->notifyBlockBeingInlined(dest, source, before);
   for (auto it : llvm::zip(source->getArguments(), argValues))
@@ -1691,6 +1657,8 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest,
 }
 
 void ConversionPatternRewriter::startOpModification(Operation *op) {
+  assert(!impl->wasOpReplaced(op) &&
+         "attempting to modify a replaced/erased op");
 #ifndef NDEBUG
   impl->pendingRootUpdates.insert(op);
 #endif
@@ -1698,6 +1666,8 @@ void ConversionPatternRewriter::startOpModification(Operation *op) {
 }
 
 void ConversionPatternRewriter::finalizeOpModification(Operation *op) {
+  assert(!impl->wasOpReplaced(op) &&
+         "attempting to modify a replaced/erased op");
   PatternRewriter::finalizeOpModification(op);
   // There is nothing to do here, we only need to track the operation at the
   // start of the update.
@@ -1912,8 +1882,13 @@ OperationLegalizer::legalize(Operation *op,
 
     // If this operation is recursively legal, mark its children as ignored so
     // that we don't consider them for legalization.
-    if (legalityInfo->isRecursivelyLegal)
-      rewriter.getImpl().markNestedOpsIgnored(op);
+    if (legalityInfo->isRecursivelyLegal) {
+      op->walk([&](Operation *nested) {
+        if (op != nested)
+          rewriter.getImpl().ignoredOps.insert(nested);
+      });
+    }
+
     return success();
   }
 
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 74921544c55578..baf07ea1f010ac 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -173,6 +173,7 @@ if(LLVM_BUILD_EXAMPLES)
     transform-opt-ch3
     transform-opt-ch4
     mlir-minimal-opt
+    mlir-transform-opt
     )
   if(MLIR_ENABLE_EXECUTION_ENGINE)
     list(APPEND MLIR_TEST_DEPENDS
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
index 00d7b6b8d65f67..23e0edd510cbb1 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
@@ -927,3 +927,57 @@ func.func @affine_parallel_with_reductions_i64(%arg0: memref<3x3xi64>, %arg1: me
 // CHECK:      scf.reduce.return %[[RES]] : i64
 // CHECK:    }
 // CHECK:  }
+
+///////////////////////////////////////////////////////////////////////
+
+func.func @test_dilinearize_index(%linear_index: index) -> (index, index, index) {
+  %b0 = arith.constant 16 : index
+  %b1 = arith.constant 224 : index
+  %b2 = arith.constant 224 : index
+  %1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index
+  return %1#0, %1#1, %1#2 : index, index, index
+}
+// CHECK-LABEL:   func.func @test_dilinearize_index(
+// CHECK-SAME:                                      %[[VAL_0:.*]]: index) -> (index, index, index) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 16 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 224 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 224 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 50176 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 50176 : index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_7:.*]] = arith.constant -1 : index
+// CHECK:           %[[VAL_8:.*]] = arith.cmpi slt, %[[VAL_0]], %[[VAL_6]] : index
+// CHECK:           %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_0]] : index
+// CHECK:           %[[VAL_10:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_0]] : index
+// CHECK:           %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_5]] : index
+// CHECK:           %[[VAL_12:.*]] = arith.subi %[[VAL_7]], %[[VAL_11]] : index
+// CHECK:           %[[VAL_13:.*]] = arith.select %[[VAL_8]], %[[VAL_12]], %[[VAL_11]] : index
+// CHECK:           %[[VAL_14:.*]] = arith.constant 50176 : index
+// CHECK:           %[[VAL_15:.*]] = arith.remsi %[[VAL_0]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_16:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : index
+// CHECK:           %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_14]] : index
+// CHECK:           %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK:           %[[VAL_20:.*]] = arith.constant 50176 : index
+// CHECK:           %[[VAL_21:.*]] = arith.remsi %[[VAL_0]], %[[VAL_20]] : index
+// CHECK:           %[[VAL_22:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_23:.*]] = arith.cmpi slt, %[[VAL_21]], %[[VAL_22]] : index
+// CHECK:           %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_20]] : index
+// CHECK:           %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_21]] : index
+// CHECK:           %[[VAL_26:.*]] = arith.constant 224 : index
+// CHECK:           %[[VAL_27:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_28:.*]] = arith.constant -1 : index
+// CHECK:           %[[VAL_29:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_27]] : index
+// CHECK:           %[[VAL_30:.*]] = arith.subi %[[VAL_28]], %[[VAL_25]] : index
+// CHECK:           %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_25]] : index
+// CHECK:           %[[VAL_32:.*]] = arith.divsi %[[VAL_31]], %[[VAL_26]] : index
+// CHECK:           %[[VAL_33:.*]] = arith.subi %[[VAL_28]], %[[VAL_32]] : index
+// CHECK:           %[[VAL_34:.*]] = arith.select %[[VAL_29]], %[[VAL_33]], %[[VAL_32]] : index
+// CHECK:           %[[VAL_35:.*]] = arith.constant 224 : index
+// CHECK:           %[[VAL_36:.*]] = arith.remsi %[[VAL_0]], %[[VAL_35]] : index
+// CHECK:           %[[VAL_37:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_38:.*]] = arith.cmpi slt, %[[VAL_36]], %[[VAL_37]] : index
+// CHECK:           %[[VAL_39:.*]] = arith.addi %[[VAL_36]], %[[VAL_35]] : index
+// CHECK:           %[[VAL_40:.*]] = arith.select %[[VAL_38]], %[[VAL_39]], %[[VAL_36]] : index
+// CHECK:           return %[[VAL_13]], %[[VAL_34]], %[[VAL_40]] : index, index, index
+// CHECK:         }
diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
index a1de1ff87c229f..983eee732e2afe 100644
--- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
+++ b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
@@ -1,13 +1,13 @@
-// RUN: mlir-opt  %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
   
   // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
-  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, 
+  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
                     %readRow: index, %readCol: index,
                     %writeRow: index, %writeCol: index,
-                    %fragRow: index, %fragCol: index, 
+                    %fragRow: index, %fragCol: index,
                     %fragColPerm: index,
                     %stRow: index, %stCol: index) {
-    // CHECK:    %[[cst:.+]] = arith.constant 0.000000e+00 : f16                  
+    // CHECK:    %[[cst:.+]] = arith.constant 0.000000e+00 : f16
     %cst = arith.constant 0.000000e+00 : f16
 
     // CHECK: [[shmA:%.+]] = memref.alloc
@@ -15,42 +15,36 @@
     %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3>
     %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
 
-    // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
     %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                  
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]       
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]     
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]  
-    // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
     vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
     gpu.barrier
     gpu.barrier
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                     
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]     
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]       
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
     // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] 
-    // CHECK:  vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16>
     %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
 
-    // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
     %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                  
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]       
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]     
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]  
-    // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
     vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
     gpu.barrier
     gpu.barrier
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                     
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]          
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                     
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] 
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
     // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
-    // CHECK:  vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16>
     %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
     return
   }
diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
index 143e7c2d270952..b1bb91ffc29721 100644
--- a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
+++ b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
@@ -1,10 +1,10 @@
-// RUN: mlir-opt  %s -transform-interpreter  | FileCheck %s
+// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
 
   // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
-  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, 
+  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
                     %readRow: index, %readCol: index,
                     %writeRow: index, %writeCol: index,
-                    %fragRow: index, %fragCol: index, 
+                    %fragRow: index, %fragCol: index,
                     %fragColPerm: index,
                     %stRow: index, %stCol: index) {
     %cst = arith.constant 0.000000e+00 : f16
@@ -13,33 +13,33 @@
     %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
 
     %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                  
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]       
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]     
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]  
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
     vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
     gpu.barrier
     gpu.barrier
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                     
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]     
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]       
-    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] 
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
     %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
     %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                  
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]       
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                 
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]     
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]  
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
     vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
     gpu.barrier
     gpu.barrier
-    // CHECK: [[c7:%.+]] = arith.constant 7 : index                     
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]          
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index                     
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] 
+    // CHECK: [[c6:%.+]] = arith.constant 6 : index
+    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
+    // CHECK: [[c2:%.+]] = arith.constant 2 : index
+    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
     // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
     %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
     return
@@ -48,7 +48,7 @@
 module attributes { transform.with_named_sequence } {
   transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.amdgpu.optimize_shared_memory_reads_and_writes %0 : (!transform.any_op) -> ()
+    transform.amdgpu.optimize_shared_memory_reads_and_writes %0 {sharedMemoryLineSizeBytes = 128, defaultVectorSizeBits = 128}: (!transform.any_op) -> ()
     transform.yield
   } // @__transform_main
 } // module
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
index ec258a8138311e..cd993d16c2da3d 100644
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -255,36 +255,21 @@ func.func @truncf_f32(%arg0 : f32) -> bf16 {
 }
 
 // CHECK-LABEL: @truncf_f32
-
-// CHECK-DAG: %[[C16:.+]] = arith.constant 16
-// CHECK-DAG: %[[C32768:.+]] = arith.constant 32768
-// CHECK-DAG: %[[C2130706432:.+]] = arith.constant 2130706432
-// CHECK-DAG: %[[C2139095040:.+]] = arith.constant 2139095040
-// CHECK-DAG: %[[C8388607:.+]] = arith.constant 8388607
-// CHECK-DAG: %[[C31:.+]] = arith.constant 31
-// CHECK-DAG: %[[C23:.+]] = arith.constant 23
-// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %arg0
-// CHECK-DAG: %[[SIGN:.+]] = arith.shrui %[[BITCAST:.+]], %[[C31]]
-// CHECK-DAG: %[[ROUND:.+]] = arith.subi %[[C32768]], %[[SIGN]]
-// CHECK-DAG: %[[MANTISSA:.+]] = arith.andi %[[BITCAST]], %[[C8388607]]
-// CHECK-DAG: %[[ROUNDED:.+]] = arith.addi %[[MANTISSA]], %[[ROUND]]
-// CHECK-DAG: %[[ROLL:.+]] = arith.shrui %[[ROUNDED]], %[[C23]]
-// CHECK-DAG: %[[SHR:.+]] = arith.shrui %[[ROUNDED]], %[[ROLL]]
-// CHECK-DAG: %[[EXP:.+]] = arith.andi %0, %[[C2139095040]]
-// CHECK-DAG: %[[EXPROUND:.+]] = arith.addi %[[EXP]], %[[ROUNDED]]
-// CHECK-DAG: %[[EXPROLL:.+]] = arith.andi %[[EXPROUND]], %[[C2139095040]]
-// CHECK-DAG: %[[EXPMAX:.+]] = arith.cmpi uge, %[[EXP]], %[[C2130706432]]
-// CHECK-DAG: %[[EXPNEW:.+]] = arith.select %[[EXPMAX]], %[[EXP]], %[[EXPROLL]]
-// CHECK-DAG: %[[OVERFLOW_B:.+]] = arith.trunci %[[ROLL]]
-// CHECK-DAG: %[[KEEP_MAN:.+]] = arith.andi %[[EXPMAX]], %[[OVERFLOW_B]]
-// CHECK-DAG: %[[MANNEW:.+]] = arith.select %[[KEEP_MAN]], %[[MANTISSA]], %[[SHR]]
-// CHECK-DAG: %[[NEWSIGN:.+]] = arith.shli %[[SIGN]], %[[C31]]
-// CHECK-DAG: %[[WITHEXP:.+]] = arith.ori %[[NEWSIGN]], %[[EXPNEW]]
-// CHECK-DAG: %[[WITHMAN:.+]] = arith.ori %[[WITHEXP]], %[[MANNEW]]
-// CHECK-DAG: %[[SHIFT:.+]] = arith.shrui %[[WITHMAN]], %[[C16]]
-// CHECK-DAG: %[[TRUNC:.+]] = arith.trunci %[[SHIFT]]
-// CHECK-DAG: %[[RES:.+]] = arith.bitcast %[[TRUNC]]
-// CHECK: return %[[RES]]
+// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : i32
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16 : i32
+// CHECK-DAG: %[[C7FC0_i16:.+]] = arith.constant 32704 : i16
+// CHECK-DAG: %[[C7FFF:.+]] = arith.constant 32767 : i32
+// CHECK-DAG: %[[ISNAN:.+]] = arith.cmpf une, %arg0, %arg0 : f32
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %arg0 : f32 to i32
+// CHECK-DAG: %[[SHRUI:.+]] = arith.shrui %[[BITCAST]], %[[C16]] : i32
+// CHECK-DAG: %[[BIT16:.+]] = arith.andi %[[SHRUI]], %[[C1]] : i32
+// CHECK-DAG: %[[ROUNDING_BIAS:.+]] = arith.addi %[[BIT16]], %[[C7FFF]] : i32
+// CHECK-DAG: %[[BIASED:.+]] = arith.addi %[[BITCAST]], %[[ROUNDING_BIAS]] : i32
+// CHECK-DAG: %[[BIASED_SHIFTED:.+]] = arith.shrui %[[BIASED]], %[[C16]] : i32
+// CHECK-DAG: %[[NORMAL_CASE_RESULT_i16:.+]] = arith.trunci %[[BIASED_SHIFTED]] : i32 to i16
+// CHECK-DAG: %[[SELECT:.+]] = arith.select %[[ISNAN]], %[[C7FC0_i16]], %[[NORMAL_CASE_RESULT_i16]] : i16
+// CHECK-DAG: %[[RESULT:.+]] = arith.bitcast %[[SELECT]] : i16 to bf16
+// CHECK: return %[[RESULT]]
 
 // -----
 
diff --git a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
new file mode 100644
index 00000000000000..858c133dd536ca
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
@@ -0,0 +1,99 @@
+// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func.func @fill(
+// CHECK-SAME:                  %[[ARG0:.*]]: f32,
+// CHECK-SAME:                  %[[ARG1:.*]]: memref<32x7xf32>
+// CHECK-NEXT:    %[[FLATTENED:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT:    linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : memref<224xf32>)
+func.func @fill(%cst: f32, %arg: memref<32x7xf32>) {
+    linalg.fill ins(%cst: f32) outs(%arg: memref<32x7xf32>)
+    return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %flattened = transform.structured.flatten_elementwise %0
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @fill_tensor(
+// CHECK-SAME:                         %[[ARG0:.*]]: f32,
+// CHECK-SAME:                         %[[ARG1:.*]]: tensor<32x7xf32>
+// CHECK-NEXT:    %[[FLATTENED:.*]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[FLATTENED_RESULT:.*]] = linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : tensor<224xf32>)
+// CHECK-NEXT:    %[[RESULT:.*]] = tensor.expand_shape %[[FLATTENED_RESULT]] {{\[}}[0, 1]]
+func.func @fill_tensor(%cst: f32, %arg: tensor<32x7xf32>) -> tensor<32x7xf32> {
+    %0 = linalg.fill ins(%cst: f32) outs(%arg: tensor<32x7xf32>) ->  tensor<32x7xf32>
+    return %0 :  tensor<32x7xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %flattened = transform.structured.flatten_elementwise %0
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @map(
+// CHECK-SAME:                 %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME:                 %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME:                 %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-NEXT:    %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]]
+// CHECK-NEXT:    linalg.map { arith.addf } ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>)
+func.func @map(%arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) {
+    linalg.map {arith.addf} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>)
+    return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %flattened = transform.structured.flatten_elementwise %0
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (d0)>
+// CHECK-LABEL: func.func @generic
+// CHECK-SAME:                 %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME:                 %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-SAME:                 %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32>
+// CHECK-NEXT:    %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]]
+// CHECK-NEXT:    %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]]
+// CHECK-NEXT:    linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]], iterator_types = ["parallel"]} ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>)
+// CHECK-NEXT:       ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32, %[[C:.*]]: f32)
+// CHECK-NEXT:         %[[SUM:.*]] = arith.addf %[[A]], %[[B]]
+// CHECK-NEXT:         linalg.yield %[[SUM]]
+#map = affine_map<(d0, d1) -> (d0, d1)>
+func.func @generic( %arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) {
+    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>) {
+        ^bb0(%a: f32, %b: f32, %c: f32):
+            %0 = arith.addf %a, %b : f32
+            linalg.yield %0 : f32
+    }
+    return
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %flattened = transform.structured.flatten_elementwise %0
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
index 96953c234a0873..85e1c56dd45a0d 100644
--- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
+++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir
@@ -550,3 +550,48 @@ module attributes {transform.with_named_sequence} {
      transform.yield
    }
 }
+
+// -----
+
+#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)>
+func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) {
+  ^bb0(%out: f32):
+    %1 = linalg.index 1 : index
+    %2 = linalg.index 0 : index
+    %3 = affine.apply #map1(%1, %2, %arg2)
+    %4 = linalg.index 2 : index
+    %5 = arith.subi %c2, %4 : index
+    %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x1x3xf32>
+  return %0 : tensor<1x1x3xf32>
+}
+// CHECK-LABEL: func.func @vectorize_reverse_like_tensor_extract
+// CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]*]]
+// CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]*]]
+// CHECK-SAME:    %[[ARG2:[0-9a-zA-Z]*]]
+// CHECK-DAG:    %[[CST:.+]] = arith.constant dense<3> : vector<1x1x3xindex>
+// CHECK-DAG:    %[[C0:.+]] = arith.constant 0 : index
+// CHECK-DAG:    %[[MASK:.*]] = arith.constant dense<true> : vector<1x1x3xi1>
+// CHECK-DAG:    %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x3xf32>
+// CHECK-DAG:    %[[INIT_IDX:.+]] = arith.constant dense<[2, 1, 0]> : vector<3xindex>
+// CHECK:        %[[T0:.+]] = vector.broadcast %[[ARG2]] : index to vector<1x1x3xindex>
+// CHECK:        %[[T1:.+]] = arith.muli %[[T0]], %[[CST]] : vector<1x1x3xindex>
+// CHECK:        %[[T2:.+]] = vector.broadcast %[[INIT_IDX]]
+// CHECK:        %[[T3:.+]] = arith.addi %[[T2]], %[[T1]]
+// CHECK:        %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]]
+// CHECK:        vector.transfer_write %[[GATHER]]
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+     %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
+     %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op
+     transform.yield
+   }
+}
diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
index 316b90f85236e4..255ff5f31ed3f5 100644
--- a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
+++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir
@@ -1,16 +1,21 @@
 // RUN: mlir-opt %s
 // No need to check anything else than parsing here, this is being used by another test as data.
 
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
-  pdl.pattern @func_return : benefit(1) {
-    %0 = pdl.operation "func.return"
-    pdl.rewrite %0 with "transform.dialect"
-  }
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    transform.with_pdl_patterns %root : !transform.any_op {
+    ^bb0(%arg0: !transform.any_op):
+      pdl.pattern @func_return : benefit(1) {
+        %0 = pdl.operation "func.return"
+        pdl.rewrite %0 with "transform.dialect"
+      }
 
-  sequence %arg0 : !transform.any_op failures(propagate) {
-  ^bb1(%arg1: !transform.any_op):
-    %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return">
-    transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return">
+      sequence %arg0 : !transform.any_op failures(propagate) {
+      ^bb1(%arg1: !transform.any_op):
+        %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return">
+        transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return">
+      }
+    }
+    transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
index 5956c86ebbe4b2..f6b7f787cc2c38 100644
--- a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
+++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir
@@ -1,11 +1,13 @@
 // RUN: mlir-opt %s
 // No need to check anything else than parsing here, this is being used by another test as data.
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op
-  transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} {
-  ^bb1(%arg1: !transform.any_op):
-    transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op
+    transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} {
+    ^bb1(%arg1: !transform.any_op):
+      transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op
+    }
+    transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
index 9a7e7ca2f9536e..1c018b1b1f7796 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir
@@ -1,10 +1,15 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-ops=func.func bind-second-extra-to-ops=func.return})' \
-// RUN:             --split-input-file --verify-diagnostics
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\
+// RUN:       debug-bind-trailing-args=func.func,func.return})" \
+// RUN:   --split-input-file --verify-diagnostics
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op):
-  transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op
-  transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op,
+      %arg2: !transform.any_op) {
+    transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op
+    transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op
+    transform.yield
+  }
 }
 
 // expected-remark @below {{first extra}}
@@ -26,9 +31,13 @@ func.func @bar(%arg0: i1) {
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param<i64>):
-  // expected-error @above {{wrong kind of value provided for top-level parameter}}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op,
+      %arg2: !transform.param<i64>) {
+    // expected-error @above {{wrong kind of value provided for top-level parameter}}
+    transform.yield
+  }
 }
 
 func.func @foo() {
@@ -37,9 +46,13 @@ func.func @foo() {
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value):
-  // expected-error @above {{wrong kind of value provided for the top-level value handle}}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op,
+      %arg2: !transform.any_value) {
+    // expected-error @above {{wrong kind of value provided for the top-level value handle}}
+    transform.yield
+  }
 }
 
 func.func @foo() {
@@ -48,19 +61,27 @@ func.func @foo() {
 
 // -----
 
-// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op):
+
+module attributes {transform.with_named_sequence} {
+  // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op) {
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op):
-  transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) {
-  ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op):
-    transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op
-    transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op,
+      %arg2: !transform.any_op) {
+    transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) {
+    ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op):
+      transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op
+      transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op
+    }
+    transform.yield
   }
 }
 
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
index f59a4b6d4ccc32..6486bcae3294e4 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir
@@ -1,24 +1,37 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-params=1,2,3 bind-second-extra-to-params=42,45})' \
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(transform-interpreter{\
+// RUN:                             debug-bind-trailing-args=#1;2;3,#42;45})' \
 // RUN:          --split-input-file --verify-diagnostics
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.param<i64>, %arg2: !transform.param<i64>):
-  // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}}
-  transform.debug.emit_param_as_remark %arg1 : !transform.param<i64>
-  // expected-remark @below {{42 : i64, 45 : i64}}
-  transform.debug.emit_param_as_remark %arg2 : !transform.param<i64>
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.param<i64>,
+      %arg2: !transform.param<i64>) {
+    // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}}
+    transform.debug.emit_param_as_remark %arg1 : !transform.param<i64>
+    // expected-remark @below {{42 : i64, 45 : i64}}
+    transform.debug.emit_param_as_remark %arg2 : !transform.param<i64>
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param<i64>):
-  // expected-error @above {{wrong kind of value provided for top-level operation handle}}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_op,
+      // expected-error @above {{wrong kind of value provided for top-level operation handle}}
+      %arg2: !transform.param<i64>) {
+    transform.yield
+  }
 }
 
 // -----
 
-// expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.param<i64>, %arg2: !transform.param<i64>, %arg3: !transform.param<i64>):
+module attributes {transform.with_named_sequence} {
+  // expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}}
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.param<i64>,
+      %arg2: !transform.param<i64>, %arg3: !transform.param<i64>) {
+    transform.yield
+  }
 }
diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
index 38d7e28697774d..dcc1079267dc7c 100644
--- a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
+++ b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-results-of-ops=test.some_returning_op bind-second-extra-to-results-of-ops=test.some_other_returning_op})' \
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(transform-interpreter{\
+// RUN:     debug-bind-trailing-args=^test.some_returning_op,^test.some_other_returning_op})' \
 // RUN:             --split-input-file --verify-diagnostics
 
 // Note that diagnostic checker will merge two diagnostics with the same message
@@ -21,10 +22,14 @@
 // expected-note @below {{value handle points to an op result #1}}
 %2:2 = "test.some_other_returning_op"() : () -> (f32, f64)
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value, %arg2: !transform.any_value):
-  transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value
-  transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg0: !transform.any_op, %arg1: !transform.any_value,
+      %arg2: !transform.any_value) {
+    transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value
+    transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value
+    transform.yield
+  }
 }
 
 // -----
@@ -32,14 +37,19 @@ transform.sequence failures(propagate) {
 %0:2 = "test.some_returning_op"() : () -> (i32, i64)
 %1 = "test.some_returning_op"() : () -> index
 
-transform.sequence failures(propagate) {
-// expected-error @below {{wrong kind of value provided for top-level operation handle}}
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value):
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      // expected-error @below {{wrong kind of value provided for top-level operation handle}}
+      %arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value) {
+    transform.yield
+  }
 }
 
 // -----
 
-// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value):
+module attributes {transform.with_named_sequence} {
+  // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}}
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op, %arg1: !transform.any_value) {
+    transform.yield
+  }
 }
diff --git a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
index c7dad582dd432c..99301ea23c6f8d 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir
@@ -1,19 +1,21 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{debug-payload-root-tag=payload debug-transform-root-tag=transform})" \
-// RUN:             --allow-unregistered-dialect --split-input-file --verify-diagnostics
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\
+// RUN:         debug-payload-root-tag=payload \
+// RUN:         entry-point=transform})" \
+// RUN:   --allow-unregistered-dialect --split-input-file --verify-diagnostics
 
 // expected-error @below {{could not find the operation with transform.target_tag="payload" attribute}}
-module {
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @transform(%arg0: !transform.any_op) {
+    transform.yield
   }
 }
 
 // -----
 
-// expected-error @below {{could not find the operation with transform.target_tag="transform" attribute}}
-module {
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
+// expected-error @below {{could not find a nested named sequence with name: transform}}
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @not_transform(%arg0: !transform.any_op) {
+    transform.yield
   }
 
   module attributes {transform.target_tag="payload"} {}
@@ -21,42 +23,16 @@ module {
 
 // -----
 
-// expected-error @below {{more than one operation with transform.target_tag="transform" attribute}}
-module {
-  // expected-note @below {{first operation}}
-  transform.sequence failures(propagate) attributes {transform.target_tag="transform"} {
-  ^bb0(%arg0: !transform.any_op):
-  }
-
-  // expected-note @below {{other operation}}
-  transform.sequence failures(propagate) attributes {transform.target_tag="transform"} {
-  ^bb0(%arg0: !transform.any_op):
-  }
-
-  module attributes {transform.target_tag="payload"} {}
-}
-
-// -----
-
-module {
-  // expected-error @below {{expected the transform entry point to be a top-level transform op}}
-  func.func private @foo() attributes {transform.target_tag="transform"}
-
-  module attributes {transform.target_tag="payload"} {}
-}
-
-// -----
-
-module {
-  transform.sequence failures(suppress) attributes {transform.target_tag="transform"} {
-  ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @transform(%arg0: !transform.any_op) {
     transform.debug.emit_remark_at %arg0, "payload" : !transform.any_op
+    transform.yield
   }
 
-  // This will not be executed because it's not tagged.
-  transform.sequence failures(suppress)  {
-  ^bb0(%arg0: !transform.any_op):
+  // This will not be executed.
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
     transform.debug.emit_remark_at %arg0, "some other text that is not printed" : !transform.any_op
+    transform.yield
   }
 
   module {
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
index 59c2b672a6e6b1..9884102c6c0ff5 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir}))" \
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(\
+// RUN:     transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir},\
+// RUN:     func.func(transform-interpreter))" \
 // RUN:             --verify-diagnostics
 
 // Exercising the pass on multiple functions of different lengths that may be
diff --git a/mlir/test/Dialect/Transform/test-interpreter-external.mlir b/mlir/test/Dialect/Transform/test-interpreter-external.mlir
index ba8e0c6870dbf8..599ce05fcc40b1 100644
--- a/mlir/test/Dialect/Transform/test-interpreter-external.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter-external.mlir
@@ -1,4 +1,6 @@
-// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir})" \
+// RUN: mlir-opt %s --pass-pipeline="builtin.module(\
+// RUN:                 transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir},\
+// RUN:                 transform-interpreter)" \
 // RUN:             --verify-diagnostics
 
 // The schedule in the separate file emits remarks at the payload root.
diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir
index de5807b2874b27..b6850e2024d53d 100644
--- a/mlir/test/Dialect/Transform/test-interpreter.mlir
+++ b/mlir/test/Dialect/Transform/test-interpreter.mlir
@@ -1411,7 +1411,6 @@ module attributes {transform.with_named_sequence} {
 // -----
 
 // expected-error @below {{could not find a nested named sequence with name: __transform_main}}
-// expected-error @below {{could not find transform entry point: __transform_main in either payload or transform module}}
 module {
 }
 
diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir
index 65625457c86898..7cb5387b937d45 100644
--- a/mlir/test/Dialect/Transform/test-pass-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pass-application.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL: func @successful_pass_application(
 //       CHECK:   %[[c5:.*]] = arith.constant 5 : index
@@ -9,10 +9,12 @@ func.func @successful_pass_application(%t: tensor<5xf32>) -> index {
   return %dim : index
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -22,12 +24,14 @@ func.func @pass_pipeline() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // This pipeline does not do anything. Just make sure that the pipeline is
-  // found and no error is produced.
-  transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // This pipeline does not do anything. Just make sure that the pipeline is
+    // found and no error is produced.
+    transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -36,11 +40,13 @@ func.func @invalid_pass_name() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}}
-  transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}}
+    transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -53,11 +59,13 @@ func.func @not_isolated_from_above(%t: tensor<5xf32>) -> index {
   return %dim : index
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{pass pipeline failed}}
-  transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{pass pipeline failed}}
+    transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -66,11 +74,13 @@ func.func @invalid_pass_option() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
-  transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}}
+    transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -80,27 +90,29 @@ func.func @valid_pass_option() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-module {
+module attributes {transform.with_named_sequence} {
   // expected-error @below {{trying to schedule a pass on an unsupported operation}}
   // expected-note @below {{target op}}
   func.func @invalid_target_op_type() {
     return
   }
 
-  transform.sequence failures(propagate) {
-  ^bb1(%arg1: !transform.any_op):
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
     %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
 
     // func-bufferize can be applied only to ModuleOps.
     // expected-error @below {{pass pipeline failed}}
     transform.apply_registered_pass "func-bufferize" to %1 : (!transform.any_op) -> !transform.any_op
+    transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir
index 10cd9ef351fe54..0c41e81b17b522 100644
--- a/mlir/test/Dialect/Transform/test-pattern-application.mlir
+++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s
 
 // CHECK-LABEL: func @update_tracked_op_mapping()
 //       CHECK:   "test.container"() ({
@@ -11,15 +11,17 @@ func.func @update_tracked_op_mapping() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
-  // Add an attribute to %1, which is now mapped to a new op.
-  transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    // Add an attribute to %1, which is now mapped to a new op.
+    transform.annotate %1 "annotated" : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -33,19 +35,21 @@ func.func @replacement_op_not_found() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-note @below {{replacement is required because this handle must be updated}}
-  %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}}
-  // expected-note @below {{ran out of suitable replacement values}}
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
-  // %1 must be used in some way. If no replacement payload op could be found,
-  // an error is thrown only if the handle is not dead.
-  transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-note @below {{replacement is required because this handle must be updated}}
+    %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}}
+    // expected-note @below {{ran out of suitable replacement values}}
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    // %1 must be used in some way. If no replacement payload op could be found,
+    // an error is thrown only if the handle is not dead.
+    transform.annotate %1 "annotated" : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -61,14 +65,16 @@ func.func @replacement_op_for_dead_handle_not_found() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // No error because %1 is dead.
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // No error because %1 is dead.
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -84,14 +90,16 @@ func.func @replacement_op_not_found_silenced() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } {transform.silence_tracking_failures} : !transform.any_op
-  transform.annotate %1 "annotated" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } {transform.silence_tracking_failures} : !transform.any_op
+    transform.annotate %1 "annotated" : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -103,12 +111,14 @@ func.func @patterns_apply_only_to_target_body() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-%0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+  %0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -125,16 +135,18 @@ func.func @erase_tracked_op() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
-  transform.apply_patterns to %0 {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
-  // No marker should be printed.
-  transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
+    transform.apply_patterns to %0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    // No marker should be printed.
+    transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -143,7 +155,7 @@ transform.sequence failures(propagate) {
 //       CHECK:   "test.container"() ({
 //  CHECK-NEXT:   ^bb0:
 //  CHECK-NEXT:   }) : () -> ()
-module {
+module attributes {transform.with_named_sequence} {
   func.func @erase_tracked_op_in_named_sequence() {
     "test.container"() ({
       // expected-remark @below {{matched op}}
@@ -152,23 +164,21 @@ module {
     return
   }
 
-  module attributes { transform.with_named_sequence } {
-    transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () {
-      transform.apply_patterns to %arg0 {
-        transform.apply_patterns.transform.test_patterns
-      } : !transform.any_op
-      transform.yield
-    }
+  transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () {
+    transform.apply_patterns to %arg0 {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    transform.yield
+  }
 
-    transform.sequence failures(propagate) {
-    ^bb1(%arg1: !transform.any_op):
-      %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-      transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
-      include @foo failures(propagate) (%0) : (!transform.any_op) -> ()
-      // No marker should be printed.
-      transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
-    }
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.debug.emit_remark_at %1, "matched op" : !transform.any_op
+    transform.include @foo failures(propagate) (%0) : (!transform.any_op) -> ()
+    // No marker should be printed.
+    transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op
+    transform.yield
   }
 }
 
@@ -183,13 +193,15 @@ func.func @canonicalization(%t: tensor<5xf32>) -> index {
   return %dim : index
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %1 {
-    transform.apply_patterns.canonicalization
-  } : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %1 {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -200,13 +212,13 @@ module {
     return
   }
 
-  module {
-    transform.sequence failures(propagate) {
-    ^bb1(%arg1: !transform.any_op):
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
       // expected-error @below {{cannot apply transform to itself (or one of its ancestors)}}
       transform.apply_patterns to %arg1 {
         transform.apply_patterns.canonicalization
       } : !transform.any_op
+      transform.yield
     }
   }
 }
@@ -224,12 +236,14 @@ func.func @canonicalization_and_cse(%m: memref<5xf32>) {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_patterns to %1 {
-    transform.apply_patterns.canonicalization
-  } {apply_cse} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %1 {
+      transform.apply_patterns.canonicalization
+    } {apply_cse} : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -243,15 +257,17 @@ func.func @full_dialect_conversion() -> tensor<5xf32> {
   return %0 : tensor<5xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_conversion_patterns to %0 {
-    transform.apply_conversion_patterns.transform.test_conversion_patterns
-  } with type_converter {
-    transform.apply_conversion_patterns.transform.test_type_converter
-  } {legal_ops = ["func.func", "func.return", "test.new_op"]}
-      : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_conversion_patterns to %0 {
+      transform.apply_conversion_patterns.transform.test_conversion_patterns
+    } with type_converter {
+      transform.apply_conversion_patterns.transform.test_type_converter
+    } {legal_ops = ["func.func", "func.return", "test.new_op"]}
+        : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -266,16 +282,18 @@ func.func @full_dialect_conversion_failed() -> tensor<5xf32> {
   return %0 : tensor<5xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below{{dialect conversion failed}}
-  transform.apply_conversion_patterns to %0 {
-    transform.apply_conversion_patterns.transform.test_conversion_patterns
-  } with type_converter {
-    transform.apply_conversion_patterns.transform.test_type_converter
-  } {legal_ops = ["func.func", "func.return", "test.new_op"]}
-      : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below{{dialect conversion failed}}
+    transform.apply_conversion_patterns to %0 {
+      transform.apply_conversion_patterns.transform.test_conversion_patterns
+    } with type_converter {
+      transform.apply_conversion_patterns.transform.test_type_converter
+    } {legal_ops = ["func.func", "func.return", "test.new_op"]}
+        : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
@@ -294,98 +312,108 @@ func.func @partial_dialect_conversion() -> tensor<5xf32> {
   return %0 : tensor<5xf32>
 }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_conversion_patterns to %0 {
-    transform.apply_conversion_patterns.transform.test_conversion_patterns
-  } with type_converter {
-    transform.apply_conversion_patterns.transform.test_type_converter
-  } {legal_ops = ["func.func", "func.return", "test.new_op"],
-     partial_conversion} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_conversion_patterns to %0 {
+      transform.apply_conversion_patterns.transform.test_conversion_patterns
+    } with type_converter {
+      transform.apply_conversion_patterns.transform.test_type_converter
+    } {legal_ops = ["func.func", "func.return", "test.new_op"],
+       partial_conversion} : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}}
-  transform.apply_conversion_patterns to %0 {
-    // expected-note @below{{pattern descriptor op}}
-    transform.apply_conversion_patterns.transform.test_conversion_patterns
-  } {illegal_ops = ["test.foo"]} : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}}
+    transform.apply_conversion_patterns to %0 {
+      // expected-note @below{{pattern descriptor op}}
+      transform.apply_conversion_patterns.transform.test_conversion_patterns
+    } {illegal_ops = ["test.foo"]} : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_conversion_patterns to %0 {
-    // expected-error @below{{expected LLVMTypeConverter}}
-    transform.apply_conversion_patterns.dialect_to_llvm "test"
-  } with type_converter {
-    transform.apply_conversion_patterns.transform.test_type_converter
-  } {illegal_ops = ["test.foo"],
-     legal_ops = ["func.func", "func.return", "test.new_op"]}
-      : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_conversion_patterns to %0 {
+      // expected-error @below{{expected LLVMTypeConverter}}
+      transform.apply_conversion_patterns.dialect_to_llvm "test"
+    } with type_converter {
+      transform.apply_conversion_patterns.transform.test_type_converter
+    } {illegal_ops = ["test.foo"],
+       legal_ops = ["func.func", "func.return", "test.new_op"]}
+        : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_conversion_patterns to %0 {
-    // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}}
-    transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist"
-  } with type_converter {
-    transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
-  } {illegal_ops = ["test.foo"],
-     legal_ops = ["func.func", "func.return", "test.new_op"]}
-      : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_conversion_patterns to %0 {
+      // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}}
+      transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist"
+    } with type_converter {
+      transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+    } {illegal_ops = ["test.foo"],
+       legal_ops = ["func.func", "func.return", "test.new_op"]}
+        : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.apply_conversion_patterns to %0 {
-    // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}}
-    transform.apply_conversion_patterns.dialect_to_llvm "transform"
-  } with type_converter {
-    transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
-  } {illegal_ops = ["test.foo"],
-     legal_ops = ["func.func", "func.return", "test.new_op"]}
-      : !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.apply_conversion_patterns to %0 {
+      // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}}
+      transform.apply_conversion_patterns.dialect_to_llvm "transform"
+    } with type_converter {
+      transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+    } {illegal_ops = ["test.foo"],
+       legal_ops = ["func.func", "func.return", "test.new_op"]}
+        : !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
 module attributes { transform.with_named_sequence } {
-func.func @replacement_op_not_found() {
-  // No op replacement can be found, but there are no handles that must be
-  // updated. No error should be reported.
-  "test.container"() ({
-    %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32)
-  }) : () -> ()
-  return
-}
+  func.func @replacement_op_not_found() {
+    // No op replacement can be found, but there are no handles that must be
+    // updated. No error should be reported.
+    "test.container"() ({
+      %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32)
+    }) : () -> ()
+    return
+  }
 
-transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) {
-  transform.apply_patterns to %container {
-    transform.apply_patterns.transform.test_patterns
-  } : !transform.any_op
-  transform.yield
-}
+  transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) {
+    transform.apply_patterns to %container {
+      transform.apply_patterns.transform.test_patterns
+    } : !transform.any_op
+    transform.yield
+  }
 
-transform.sequence failures(propagate) {
-^bb1(%arg1: !transform.any_op):
-  %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-  transform.annotate %1 "annotated" : !transform.any_op
-  transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> ()
-}
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op) {
+    %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.annotate %1 "annotated" : !transform.any_op
+    transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> ()
+    transform.yield
+  }
 }
diff --git a/mlir/test/Dialect/Transform/test-pdl-extension.mlir b/mlir/test/Dialect/Transform/test-pdl-extension.mlir
index a9710f75531262..a3349c1ba50594 100644
--- a/mlir/test/Dialect/Transform/test-pdl-extension.mlir
+++ b/mlir/test/Dialect/Transform/test-pdl-extension.mlir
@@ -1,21 +1,26 @@
-// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics
-
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
-  sequence %arg0 : !transform.any_op failures(propagate) {
-  ^bb0(%arg1: !transform.any_op):
-    %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.debug.emit_remark_at %0, "matched" : !transform.any_op
-  }
-
-  pdl.pattern @some : benefit(1) {
-    %0 = pdl.operation "test.some_op"
-    pdl.rewrite %0 with "transform.dialect"
-  }
-
-  pdl.pattern @other : benefit(1) {
-    %0 = pdl.operation "test.other_op"
-    pdl.rewrite %0 with "transform.dialect"
+// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    transform.with_pdl_patterns %root : !transform.any_op {
+    ^bb0(%arg0: !transform.any_op):
+      sequence %arg0 : !transform.any_op failures(propagate) {
+      ^bb0(%arg1: !transform.any_op):
+        %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
+        transform.debug.emit_remark_at %0, "matched" : !transform.any_op
+      }
+
+      pdl.pattern @some : benefit(1) {
+        %0 = pdl.operation "test.some_op"
+        pdl.rewrite %0 with "transform.dialect"
+      }
+
+      pdl.pattern @other : benefit(1) {
+        %0 = pdl.operation "test.other_op"
+        pdl.rewrite %0 with "transform.dialect"
+      }
+    }
+    transform.yield
   }
 }
 
@@ -28,17 +33,22 @@ transform.with_pdl_patterns {
 
 // -----
 
-transform.with_pdl_patterns {
-^bb0(%arg0: !transform.any_op):
-  sequence %arg0 : !transform.any_op failures(propagate) {
-  ^bb1(%arg1: !transform.any_op):
-    %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
-  }
-
-  pdl.pattern @some : benefit(1) {
-    %0 = pdl.operation "test.some_op"
-    pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation)
-    pdl.rewrite %0 with "transform.dialect"
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    transform.with_pdl_patterns %root : !transform.any_op {
+    ^bb0(%arg0: !transform.any_op):
+      sequence %arg0 : !transform.any_op failures(propagate) {
+      ^bb1(%arg1: !transform.any_op):
+        %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op
+      }
+
+      pdl.pattern @some : benefit(1) {
+        %0 = pdl.operation "test.some_op"
+        pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation)
+        pdl.rewrite %0 with "transform.dialect"
+      }
+    }
+    transform.yield
   }
 }
 
diff --git a/mlir/test/Dialect/Transform/transform-state-extension.mlir b/mlir/test/Dialect/Transform/transform-state-extension.mlir
index a26293fbe51ca6..e8c0b7a8a3aac0 100644
--- a/mlir/test/Dialect/Transform/transform-state-extension.mlir
+++ b/mlir/test/Dialect/Transform/transform-state-extension.mlir
@@ -1,89 +1,95 @@
-// RUN: mlir-opt %s -test-transform-dialect-interpreter -verify-diagnostics -split-input-file
+// RUN: mlir-opt %s -transform-interpreter -verify-diagnostics -split-input-file
 
 // expected-note @below {{associated payload op}}
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
     // expected-remark @below {{extension absent}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
-    test_add_test_extension "A"
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_add_test_extension "A"
     // expected-remark @below {{extension present, A}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
-    test_remove_test_extension
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_remove_test_extension
     // expected-remark @below {{extension absent}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.yield
   }
 }
 
 // -----
 
 // expected-note @below {{associated payload op}}
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
-    test_add_test_extension "A"
-    test_remove_test_extension
-    test_add_test_extension "B"
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.test_add_test_extension "A"
+    transform.test_remove_test_extension
+    transform.test_add_test_extension "B"
     // expected-remark @below {{extension present, B}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.yield
   }
 }
 
 // -----
 
 // expected-note @below {{associated payload op}}
-module {
-  transform.sequence failures(propagate) {
-  ^bb0(%arg0: !transform.any_op):
-    test_add_test_extension "A"
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.test_add_test_extension "A"
     // expected-remark @below {{extension present, A}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
     // expected-note @below {{associated payload op}}
-    test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
     // expected-remark @below {{extension present, A}}
-    test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.test_check_if_test_extension_present %arg0 : !transform.any_op
+    transform.yield
   }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  test_add_test_extension "A"
-   // This is okay because we are replacing the top-level module operation
-   // (0 results) with this operation that has _more_ (1) results.
-  %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.test_add_test_extension "A"
+     // This is okay because we are replacing the top-level module operation
+     // (0 results) with this operation that has _more_ (1) results.
+    %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  test_add_test_extension "A"
-  %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
-  // This is still okay. Even though we are replacing the previous
-  // operation with (1 result) with this operation that has less (0) results,
-  // there is no handle to the result, hence no issue with value handle update.
-  test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.test_add_test_extension "A"
+    %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    // This is still okay. Even though we are replacing the previous
+    // operation with (1 result) with this operation that has less (0) results,
+    // there is no handle to the result, hence no issue with value handle update.
+    transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
 }
 
 // -----
 
-transform.sequence failures(propagate) {
-^bb0(%arg0: !transform.any_op):
-  test_add_test_extension "A"
-  // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}}
-  %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
-  %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value
-  test_remap_operand_to_self %dummy : (!transform.any_op) -> ()
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    transform.test_add_test_extension "A"
+    // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}}
+    %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value
+    transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> ()
+    transform.yield
+  }
 }
 
 // -----
 
-module {
-  transform.sequence failures(suppress) {
-  ^bb0(%arg0: !transform.any_op):
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
     // expected-error @below {{TestTransformStateExtension missing}}
-    test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.yield
   }
 }
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index dc486181ebe960..e6f045e12e5197 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -2567,76 +2567,3 @@ func.func @load_store_forwarding_rank_mismatch(%v0: vector<4x1x1xf32>, %arg0: te
       tensor<4x4x4xf32>, vector<1x100x4x5xf32>
   return %r : vector<1x100x4x5xf32>
 }
-
-// -----
-
-// CHECK-LABEL: func.func @extract_true_from_constant_mask() -> i1 {
-func.func @extract_true_from_constant_mask() -> i1 {
-// CHECK:      %[[TRUE:.*]] = arith.constant true
-// CHECK-NEXT: return %[[TRUE]] : i1
-  %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1>
-  %extract = vector.extract %mask[1, 1, 2] : i1 from vector<4x4x4xi1>
-  return %extract : i1
-}
-
-// -----
-
-// CHECK-LABEL: func.func @extract_false_from_constant_mask() -> i1 {
-func.func @extract_false_from_constant_mask() -> i1 {
-// CHECK:      %[[FALSE:.*]] = arith.constant false
-// CHECK-NEXT: return %[[FALSE]] : i1
-  %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1>
-  %extract = vector.extract %mask[1, 2, 2] : i1 from vector<4x4x4xi1>
-  return %extract : i1
-}
-
-// -----
-
-// CHECK-LABEL: func.func @extract_from_create_mask() -> i1 {
-func.func @extract_from_create_mask() -> i1 {
-// CHECK:      %[[TRUE:.*]] = arith.constant true
-// CHECK-NEXT: return %[[TRUE]] : i1
-  %c2 = arith.constant 2 : index
-  %c3 = arith.constant 3 : index
-  %mask = vector.create_mask %c2, %c2, %c3 : vector<4x4x4xi1>
-  %extract = vector.extract %mask[1, 1, 2] : i1 from vector<4x4x4xi1>
-  return %extract : i1
-}
-
-// -----
-
-// CHECK-LABEL: func.func @extract_subvector_from_constant_mask() ->
-// CHECK-SAME:  vector<6xi1> {
-func.func @extract_subvector_from_constant_mask() -> vector<6xi1> {
-// CHECK:      %[[S0:.*]] = vector.constant_mask [4] : vector<6xi1>
-// CHECK-NEXT: return %[[S0]] : vector<6xi1>
-  %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1>
-  %extract = vector.extract %mask[1, 2] : vector<6xi1> from vector<4x5x6xi1>
-  return %extract : vector<6xi1>
-}
-
-// -----
-
-// CHECK-LABEL: func.func @extract_scalar_with_dynamic_positions(
-// CHECK-SAME:    %[[INDEX:.*]]: index) -> i1 {
-func.func @extract_scalar_with_dynamic_positions(%index: index) -> i1 {
-// CHECK:       %[[S0:.*]] = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1>
-// CHECK-NEXT:  %[[S1:.*]] = vector.extract %[[S0]][1, 1, %[[INDEX]]] : i1 from vector<4x4x4xi1>
-// CHECK-NEXT:  return %[[S1]] : i1
-  %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1>
-  %extract = vector.extract %mask[1, 1, %index] : i1 from vector<4x4x4xi1>
-  return %extract : i1
-}
-
-// -----
-
-// CHECK-LABEL: func.func @extract_subvector_with_dynamic_positions
-// CHECK-SAME:    %[[INDEX:.*]]: index) -> vector<6xi1> {
-func.func @extract_subvector_with_dynamic_positions(%index: index) -> vector<6xi1> {
-// CHECK:      %[[S0:.*]] = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1>
-// CHECK-NEXT: %[[S1:.*]] = vector.extract %[[S0]][1, %[[INDEX]]] : vector<6xi1> from vector<4x5x6xi1>
-// CHECK-NEXT: return %[[S1]] : vector<6xi1>
-  %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1>
-  %extract = vector.extract %mask[1, %index] : vector<6xi1> from vector<4x5x6xi1>
-  return %extract : vector<6xi1>
-}
diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
index 94e78ce40a3c19..8f0148119806c9 100644
--- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
+++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir
@@ -262,6 +262,48 @@ func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> {
   return %0 : vector<8x32xf32>
 }
 
+// CHECK-LABEL: func.func @aligned_trunci(
+func.func @aligned_trunci(%a: vector<8xi32>) -> vector<8xi4> {
+// CHECK-SAME:    %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> {
+// CHECK-DAG:       %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK-DAG:       %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK:           %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8>
+// CHECK:           %[[LOW:.*]] = vector.shuffle %[[I8]], %[[I8]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+// CHECK:           %[[HIGH:.*]] = vector.shuffle %[[I8]], %[[I8]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+// CHECK:           %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8>
+// CHECK:           %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8>
+// CHECK:           %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8>
+// CHECK:           %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4>
+  %0 = arith.trunci %a : vector<8xi32> to vector<8xi4>
+  return %0 : vector<8xi4>
+}
+
+// CHECK-LABEL: func.func @aligned_trunci_base_case(
+func.func @aligned_trunci_base_case(%a: vector<8xi8>) -> vector<8xi4> {
+// CHECK-SAME:    %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> {
+// CHECK-DAG:       %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8>
+// CHECK-DAG:       %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8>
+// CHECK:           %[[LOW:.*]] = vector.shuffle %[[IN]], %[[IN]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+// CHECK:           %[[HIGH:.*]] = vector.shuffle %[[IN]], %[[IN]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+// CHECK:           %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8>
+// CHECK:           %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8>
+// CHECK:           %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8>
+// CHECK:           %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4>
+  %0 = arith.trunci %a : vector<8xi8> to vector<8xi4>
+  return %0 : vector<8xi4>
+}
+
+// CHECK-LABEL: func.func @aligned_trunci_2d(
+func.func @aligned_trunci_2d(%a: vector<8x32xi32>) -> vector<8x32xi4> {
+// CHECK-NOT:       vector.shuffle
+// CHECK-NOT:       vector.andi
+// CHECK-NOT:       vector.shli
+// CHECK-NOT:       vector.ori
+// CHECK:           arith.trunci
+  %0 = arith.trunci %a : vector<8x32xi32> to vector<8x32xi4>
+  return %0 : vector<8x32xi4>
+}
+
 // CHECK-LABEL: func.func @i4_transpose(
 func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> {
 // CHECK-SAME:    %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> {
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index ea10bd56390c78..eda6a5cc40d999 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -339,6 +339,51 @@ func.func @bubble_down_bitcast_in_strided_slice_extract_odd_size(%arg0: vector<4
   return %0: vector<3xf16>
 }
 
+// CHECK-LABEL:   func.func @bubble_up_bitcast_in_insert_i4_i8(
+// CHECK-SAME:                                                 %[[VAL:.*]]: vector<32xi4>,
+// CHECK-SAME:                                                 %[[DST:.*]]: vector<8x32xi4>) -> vector<8x16xi8> {
+func.func @bubble_up_bitcast_in_insert_i4_i8(%val: vector<32xi4>, %src: vector<8x32xi4>) -> vector<8x16xi8> {
+// CHECK:           %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<32xi4> to vector<16xi8>
+// CHECK:           %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x32xi4> to vector<8x16xi8>
+// CHECK:           vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xi8> into vector<8x16xi8>
+  %0 = vector.insert %val, %src[4] : vector<32xi4> into vector<8x32xi4>
+  %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+  return %1 : vector<8x16xi8>
+}
+
+// CHECK-LABEL:   func.func @bubble_up_bitcast_in_insert_i8_i4(
+// CHECK-SAME:                                                 %[[VAL:.*]]: vector<16xi8>,
+// CHECK-SAME:                                                 %[[DST:.*]]: vector<8x16xi8>) -> vector<8x32xi4> {
+func.func @bubble_up_bitcast_in_insert_i8_i4(%val: vector<16xi8>, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK:           %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi8> to vector<32xi4>
+// CHECK:           %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi8> to vector<8x32xi4>
+// CHECK:           vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<32xi4> into vector<8x32xi4>
+  %0 = vector.insert %val, %src[4] : vector<16xi8> into vector<8x16xi8>
+  %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+  return %1 : vector<8x32xi4>
+}
+
+// CHECK-LABEL:   func.func @bubble_up_bitcast_in_insert_i32_f32(
+// CHECK-SAME:                                                 %[[VAL:.*]]: vector<16xi32>,
+// CHECK-SAME:                                                 %[[DST:.*]]: vector<8x16xi32>) -> vector<8x16xf32> {
+func.func @bubble_up_bitcast_in_insert_i32_f32(%val: vector<16xi32>, %src: vector<8x16xi32>) -> vector<8x16xf32> {
+// CHECK:           %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi32> to vector<16xf32>
+// CHECK:           %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi32> to vector<8x16xf32>
+// CHECK:           vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xf32> into vector<8x16xf32>
+  %0 = vector.insert %val, %src[4] : vector<16xi32> into vector<8x16xi32>
+  %1 = vector.bitcast %0 : vector<8x16xi32> to vector<8x16xf32>
+  return %1 : vector<8x16xf32>
+}
+
+// CHECK-LABEL:   func.func @bubble_up_bitcast_in_insert_scalar(
+func.func @bubble_up_bitcast_in_insert_scalar(%val: i8, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK:           vector.insert
+// CHECK-NEXT:      vector.bitcast
+  %0 = vector.insert %val, %src[4, 8] : i8 into vector<8x16xi8>
+  %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+  return %1 : vector<8x32xi4>
+}
+
 // CHECK-LABEL: func @bubble_up_bitcast_in_strided_slice_insert
 //  CHECK-SAME: (%[[DST:.+]]: vector<8xf16>, %[[SRC1:.+]]: vector<4xf16>, %[[SRC2:.+]]: vector<4xf16>)
 func.func @bubble_up_bitcast_in_strided_slice_insert(%dst: vector<8xf16>, %src1: vector<4xf16>, %src2: vector<4xf16>) -> vector<4xf32> {
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 9072603734879e..bf90c4a6ebb3c2 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -1559,3 +1559,28 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1:
 //       CHECK-PROP:   %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]]
 //       CHECK-PROP:   %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]]
 //       CHECK-PROP:   vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1>
+
+// -----
+
+func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
+  %c0 = arith.constant 0 : index
+  vector.warp_execute_on_lane_0(%laneid)[32] -> () {
+    %0 = "some_def"() : () -> (vector<4x1024xf32>)
+    vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
+    vector.yield
+  }
+  return
+}
+
+//       CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
+
+// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write(
+//       CHECK-DIST-AND-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
+//       CHECK-DIST-AND-PROP:     %[[V0:.*]] = "some_def"
+//       CHECK-DIST-AND-PROP:     vector.yield %[[V0]]
+//  CHECK-DIST-AND-PROP-SAME:       vector<4x1024xf32>
+//       CHECK-DIST-AND-PROP:   }
+
+//       CHECK-DIST-AND-PROP:   %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (%c4, %c8) : index, index
+//       CHECK-DIST-AND-PROP:   %[[INNER_ID:.+]] = affine.apply #map()[%[[IDS]]#1]
+//       CHECK-DIST-AND-PROP:   vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32>
diff --git a/mlir/test/Examples/transform-opt/empty.mlir b/mlir/test/Examples/transform-opt/empty.mlir
new file mode 100644
index 00000000000000..b525769db68822
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/empty.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/external-def.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+
+// CHECK: IR printer: in self-contained
+// EXTERNAL: IR printer: external_def
+// CHECK-NOT: @__transform_main
+module {}
diff --git a/mlir/test/Examples/transform-opt/external-decl.mlir b/mlir/test/Examples/transform-opt/external-decl.mlir
new file mode 100644
index 00000000000000..5a73735892429b
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/external-decl.mlir
@@ -0,0 +1,18 @@
+// This test just needs to parse. Note that the diagnostic message below will
+// be produced in *another* multi-file test, do *not* -verify-diagnostics here.
+// RUN: mlir-opt %s
+
+// RUN: mlir-transform-opt %s --transform-library=%p/external-def.mlir | FileCheck %s
+
+module attributes {transform.with_named_sequence} {
+  // The definition should not be printed here.
+  // CHECK: @external_def
+  // CHECK-NOT: transform.print
+  transform.named_sequence private @external_def(%root: !transform.any_op {transform.readonly})
+
+  transform.named_sequence private @__transform_main(%root: !transform.any_op) {
+    // expected-error @below {{unresolved external named sequence}}
+    transform.include @external_def failures(propagate) (%root) : (!transform.any_op) -> ()
+    transform.yield
+  }
+}
diff --git a/mlir/test/Examples/transform-opt/external-def.mlir b/mlir/test/Examples/transform-opt/external-def.mlir
new file mode 100644
index 00000000000000..9dc4fbbdd6b619
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/external-def.mlir
@@ -0,0 +1,8 @@
+// RUN: mlir-opt %s
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @external_def(%root: !transform.any_op {transform.readonly}) {
+    transform.print %root { name = "external_def" } : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Examples/transform-opt/pass.mlir b/mlir/test/Examples/transform-opt/pass.mlir
new file mode 100644
index 00000000000000..5c7c8bf1e256df
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/pass.mlir
@@ -0,0 +1,19 @@
+// RUN: mlir-transform-opt %s | FileCheck %s
+
+module attributes {transform.with_named_sequence} {
+  // CHECK-LABEL: @return_42
+  // CHECK: %[[C42:.+]] = arith.constant 42
+  // CHECK: return %[[C42]]
+  func.func @return_42() -> i32 {
+    %0 = arith.constant 21 : i32
+    %1 = arith.constant 2 : i32
+    %2 = arith.muli %0, %1 : i32
+    return %2 : i32
+  }
+
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op) {
+    %arg1 = transform.apply_registered_pass "canonicalize" to %arg0 : (!transform.any_op) -> !transform.any_op
+    transform.print %arg1 : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Examples/transform-opt/self-contained.mlir b/mlir/test/Examples/transform-opt/self-contained.mlir
new file mode 100644
index 00000000000000..b9a93af61b8bbc
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/self-contained.mlir
@@ -0,0 +1,21 @@
+// RUN: mlir-transform-opt %s | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%s | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics
+
+// CHECK: IR printer: in self-contained
+// EXTERNAL: IR printer: external_def
+
+// The first occurrence comes from the print operation and the second is the
+// roundtrip output. However, we shouldn't have the symbol duplicated because
+// of library merging.
+// CHECK-COUNT-2: @__transform_main
+// CHECK-NOT: @__transform_main
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence private @__transform_main(%root: !transform.any_op) {
+    transform.print %root { name = "in self-contained" } : !transform.any_op
+    transform.yield
+  }
+}
diff --git a/mlir/test/Examples/transform-opt/syntax-error.mlir b/mlir/test/Examples/transform-opt/syntax-error.mlir
new file mode 100644
index 00000000000000..89f1d472fe8952
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/syntax-error.mlir
@@ -0,0 +1,5 @@
+// RUN: mlir-opt %s --verify-diagnostics
+// This file is used as additional input.
+
+// expected-error @below {{expected operation name in quotes}}
+module {
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
index 15bafeda67403e..437e49a6b81446 100644
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
@@ -26,7 +26,7 @@ func.func @check_results(%lhs : i16, %rhs : i16, %res0 : i16, %res1 : i16) -> ()
   %mismatch = arith.cmpi ne, %res0, %res1 : i16
   scf.if %mismatch -> () {
     vector.print %res1 : i16
-    vector.print str "Mismatch"
+    vector.print str "Mismatch\n"
   }
   return
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
index 12f13e8dbc4a9f..881e2799b5b06b 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
@@ -88,7 +88,7 @@ func.func @entry() {
   }
 
   // CHECK: SME: END OF TEST OUTPUT
-  vector.print str "SME: END OF TEST OUTPUT"
+  vector.print str "SME: END OF TEST OUTPUT\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
index ee3866de303e03..588b44a36c29f3 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
@@ -24,23 +24,23 @@ func.func @use_too_many_tiles(%a: memref<?x?xi16>, %b:  memref<?x?xi16>, %c: mem
 
   // CHECK-LABEL: tile_a:
   // CHECK-COUNT-8: ( 0, 0, 0, 0, 0, 0, 0, 0
-  vector.print str "tile_a:"
+  vector.print str "tile_a:\n"
   vector.print %tile_a : vector<[8]x[8]xi16>
   // CHECK-LABEL: tile_b:
   // CHECK-COUNT-8: ( 1, 1, 1, 1, 1, 1, 1, 1
-  vector.print str "tile_b:"
+  vector.print str "tile_b:\n"
   vector.print %tile_b : vector<[8]x[8]xi16>
   // CHECK-LABEL: tile_c:
   // CHECK-COUNT-8: ( 2, 2, 2, 2, 2, 2, 2, 2
-  vector.print str "tile_c:"
+  vector.print str "tile_c:\n"
   vector.print %tile_c : vector<[8]x[8]xi16>
   // CHECK-LABEL: tile_d:
   // CHECK-COUNT-8: ( 3, 3, 3, 3, 3, 3, 3, 3
-  vector.print str "tile_d:"
+  vector.print str "tile_d:\n"
   vector.print %tile_d : vector<[8]x[8]xi16>
   // CHECK-LABEL: tile_e:
   // CHECK-COUNT-8: ( 4, 4, 4, 4, 4, 4, 4, 4
-  vector.print str "tile_e:"
+  vector.print str "tile_e:\n"
   vector.print %tile_e : vector<[8]x[8]xi16>
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
index 415181171e27b8..1794564a6a7244 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
@@ -12,13 +12,13 @@ func.func @checkSVL() {
   %svl_h = arm_sme.streaming_vl <half>
   %svl_w = arm_sme.streaming_vl <word>
   %svl_d = arm_sme.streaming_vl <double>
-  vector.print str "SVL.b"
+  vector.print str "SVL.b\n"
   vector.print %svl_b : index
-  vector.print str "SVL.h"
+  vector.print str "SVL.h\n"
   vector.print %svl_h : index
-  vector.print str "SVL.w"
+  vector.print str "SVL.w\n"
   vector.print %svl_w : index
-  vector.print str "SVL.d"
+  vector.print str "SVL.d\n"
   vector.print %svl_d : index
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
index 2b8899b6c6fc32..41e724844fe403 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir
@@ -53,13 +53,13 @@ func.func @test_load_store_zaq0() {
 
   // CHECK-LABEL: INITIAL TILE A:
   // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
-  vector.print str "INITIAL TILE A:"
+  vector.print str "INITIAL TILE A:\n"
   func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
   vector.print punctuation <newline>
 
   // CHECK-LABEL: INITIAL TILE B:
   // CHECK: ( 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 )
-  vector.print str "INITIAL TILE B:"
+  vector.print str "INITIAL TILE B:\n"
   func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
   vector.print punctuation <newline>
 
@@ -68,13 +68,13 @@ func.func @test_load_store_zaq0() {
 
   // CHECK-LABEL: FINAL TILE A:
   // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
-  vector.print str "FINAL TILE A:"
+  vector.print str "FINAL TILE A:\n"
   func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
   vector.print punctuation <newline>
 
   // CHECK-LABEL: FINAL TILE B:
   // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 )
-  vector.print str "FINAL TILE B:"
+  vector.print str "FINAL TILE B:\n"
   func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref<?xi8>, index) -> ()
 
   return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
index 27be801252b812..68c31ac1dd8e9c 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir
@@ -49,12 +49,12 @@ func.func @entry() {
   // CHECK-NEXT: ( 2, 2, 2, 2
   // CHECK-NEXT: ( 3, 3, 3, 3
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   scf.for %i = %c0 to %za_s_size step %svl_s {
     %tileslice = vector.load %mem1[%i] : memref<?xi32>, vector<[4]xi32>
     vector.print %tileslice : vector<[4]xi32>
   }
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   // 2. VERTICAL LAYOUT
   // Dump "mem2". The smallest SVL is 128-bits so the tile will be at least
@@ -66,9 +66,9 @@ func.func @entry() {
   // CHECK-NEXT: ( 0, 1, 2, 3
   // CHECK-NEXT: ( 0, 1, 2, 3
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %0 : vector<[4]x[4]xi32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
index 9d836d93c85bb7..cd48f2a9ebfd89 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir
@@ -46,12 +46,12 @@ func.func @testTransposedReadWithMask(%maskRows: index, %maskCols: index) {
   vector.transfer_write %readTransposed, %outDyn[%c0, %c0] {in_bounds = [true, true]} : vector<[16]x[4]xf32>, memref<?x?xf32>
 
   /// Print the input memref.
-  vector.print str "Input memref:"
+  vector.print str "Input memref:\n"
   %inUnranked = memref.cast %inDyn : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> ()
 
   /// Print the result memref.
-  vector.print str "Masked transposed result:"
+  vector.print str "Masked transposed result:\n"
   %outUnranked = memref.cast %outDyn : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> ()
 
@@ -84,12 +84,12 @@ func.func @testTransposedWriteWithMask(%maskRows: index, %maskCols: index) {
     : vector<[16]x[4]xf32>, memref<?x?xf32>
 
   /// Print the input memref.
-  vector.print str "Input memref:"
+  vector.print str "Input memref:\n"
   %inUnranked = memref.cast %inDyn : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> ()
 
   /// Print the result memref.
-  vector.print str "Masked transposed result:"
+  vector.print str "Masked transposed result:\n"
   %outUnranked = memref.cast %outDyn : memref<?x?xf32> to memref<*xf32>
   call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> ()
 
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
index 7e7869d1c957aa..fb6c06cfd69999 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir
@@ -35,9 +35,9 @@ func.func @test_outerproduct_no_accumulator_4x4xf32() {
   // WITHOUT-ACC-NEXT: ( 0, 2, 4, 6
   // WITHOUT-ACC-NEXT: ( 0, 3, 6, 9
   // WITHOUT-ACC:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xf32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -60,9 +60,9 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() {
   // WITH-ACC-NEXT: ( 10, 12, 14, 16
   // WITH-ACC-NEXT: ( 10, 13, 16, 19
   // WITH-ACC:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xf32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -91,9 +91,9 @@ func.func @test_masked_outerproduct_no_accumulator_4x4xf32() {
   // WITH-MASK-NEXT: ( 3, 6, 0, 0
   // WITH-MASK-NEXT: ( 0, 0, 0, 0
   // WITH-MASK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xf32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -124,9 +124,9 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() {
   // WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10
   // WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10
   // WITH-MASK-AND-ACC:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xf32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
index 46bf799232ae3a..b8458606d3f324 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir
@@ -40,9 +40,9 @@ func.func @test_outerproduct_no_accumulator_2x2xf64() {
   // CHECK-NEXT: ( 1, 2
   // CHECK-NEXT: ( 2, 4
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[2]x[2]xf64>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -66,9 +66,9 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() {
   // WITH-ACC-NEXT: ( 11, 12
   // WITH-ACC-NEXT: ( 12, 14
   // WITH-ACC:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[2]x[2]xf64>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -96,9 +96,9 @@ func.func @test_masked_outerproduct_no_accumulator_2x2xf64() {
   // WITH-MASK-NEXT: ( 1, 0
   // WITH-MASK-NEXT: ( 2, 0
   // WITH-MASK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[2]x[2]xf64>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
@@ -127,9 +127,9 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() {
   // WITH-MASK-AND-ACC-NEXT: ( 11, 12
   // WITH-MASK-AND-ACC-NEXT: ( 10, 10
   // WITH-MASK-AND-ACC:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[2]x[2]xf64>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
index 52f56883cad9c1..7421521b96bf92 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir
@@ -14,7 +14,7 @@ func.func @transfer_read_2d(%A : memref<?x?xf32>, %base1: index, %base2: index)
   %0 = vector.transfer_read %A[%base1, %base2], %pad {in_bounds=[true, true]} :
     memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
@@ -27,7 +27,7 @@ func.func @transfer_read_2d_transposed(%A : memref<?x?xf32>, %base1: index, %bas
     {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
       : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0 : vector<[4]x[4]xf32>
 
   return
@@ -42,7 +42,7 @@ func.func @transfer_read_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: in
   %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask
     {in_bounds = [true, true]} : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
@@ -58,7 +58,7 @@ func.func @transfer_read_2d_mask_transposed(%A : memref<?x?xf32>, %base1: index,
     {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
       : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
@@ -73,7 +73,7 @@ func.func @transfer_read_2d_mask_non_zero_pad(%A : memref<?x?xf32>, %base1: inde
   %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask
     {in_bounds = [true, true]} : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
@@ -89,7 +89,7 @@ func.func @transfer_read_2d_mask_non_zero_pad_transposed(%A : memref<?x?xf32>, %
     {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]}
       : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
index 710cc6672f0057..2fef705861f28e 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir
@@ -51,7 +51,7 @@ func.func @transfer_write_2d_mask_transposed(%A : memref<?x?xf32>, %base1: index
 func.func @load_and_print(%A : memref<?x?xf32>, %base1: index, %base2: index) {
   %0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
 
-  vector.print str "TILE BEGIN:"
+  vector.print str "TILE BEGIN:\n"
   vector.print %0: vector<[4]x[4]xf32>
 
   return
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
index 88bc0d0709d489..177c96f1d8aae6 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir
@@ -51,9 +51,9 @@ func.func @entry() {
   // CHECK-NEXT: ( 2, 2, 2, 2
   // CHECK-NEXT: ( 3, 3, 3, 3
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xi32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   // Dump the transposed tile. The smallest SVL is 128-bits so the tile will be
   // at least 4x4xi32.
@@ -64,9 +64,9 @@ func.func @entry() {
   // CHECK-NEXT: ( 0, 1, 2, 3
   // CHECK-NEXT: ( 0, 1, 2, 3
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %transposed_tile : vector<[4]x[4]xi32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
index e14917486d845d..3d74508cd23b57 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir
@@ -23,9 +23,9 @@ func.func @entry() -> i32 {
   // CHECK-NEXT: ( 123, 123, 123, 123
   // CHECK-NEXT: ( 123, 123, 123, 123
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   vector.print %tile : vector<[4]x[4]xi32>
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   %c0_i32 = arith.constant 0 : i32
   return %c0_i32 : i32
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
index b29790db14ddc4..48080fd0a26a2b 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir
@@ -255,7 +255,7 @@ func.func @load_store_two_za_s_tiles() -> i32 {
   // CHECK-NEXT: ( 1, 1, 1, 1
   // CHECK-NEXT: ( 1, 1, 1, 1
   // CHECK:      TILE END
-  vector.print str "TILE BEGIN"
+  vector.print str "TILE BEGIN\n"
   scf.for %i = %c0 to %size_of_two_tiles step %svl_s {
     %av = vector.load %mem2[%i] : memref<?xi32>, vector<[4]xi32>
     vector.print %av : vector<[4]xi32>
@@ -263,11 +263,11 @@ func.func @load_store_two_za_s_tiles() -> i32 {
     %tileSizeMinusStep = arith.subi %size_of_tile, %svl_s : index
     %isNextTile = arith.cmpi eq, %i, %tileSizeMinusStep : index
     scf.if %isNextTile {
-      vector.print str "TILE END"
-      vector.print str "TILE BEGIN"
+      vector.print str "TILE END\n"
+      vector.print str "TILE BEGIN\n"
     }
   }
-  vector.print str "TILE END"
+  vector.print str "TILE END\n"
 
   return %c0_i32 : i32
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
index c486bf0de5d352..afb23e8e520660 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -24,7 +24,7 @@ func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>)  {
   /// Print each of the vectors.
   /// vscale is >= 1, so at least 8 elements will be printed.
 
-  vector.print str "read_and_print_2d_vector()"
+  vector.print str "read_and_print_2d_vector()\n"
   // CHECK-LABEL: read_and_print_2d_vector()
   // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8
   vector.print %row0 : vector<[8]xf32>
@@ -62,21 +62,21 @@ func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x
   // CHECK-LABEL: Vector A
   // CHECK-NEXT: ( 5, 5, 5, 5
   // CHECK-NEXT: ( 5, 5, 5, 5
-  vector.print str "\nVector A"
+  vector.print str "\nVector A\n"
   %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> ()
 
   // CHECK-LABEL: Vector B
   // CHECK-NEXT: ( 4, 4, 4, 4
   // CHECK-NEXT: ( 4, 4, 4, 4
-  vector.print str "\nVector B"
+  vector.print str "\nVector B\n"
   %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> ()
 
   // CHECK-LABEL: Sum
   // CHECK-NEXT: ( 9, 9, 9, 9
   // CHECK-NEXT: ( 9, 9, 9, 9
-  vector.print str "\nSum"
+  vector.print str "\nSum\n"
   %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32>
   func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> ()
 
@@ -97,7 +97,7 @@ func.func @entry() {
 
   linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>)
 
-  vector.print str "=> Print and read 2D arrays of scalable vectors:"
+  vector.print str "=> Print and read 2D arrays of scalable vectors:\n"
   func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> ()
 
   vector.print str "\n====================\n"
@@ -109,7 +109,7 @@ func.func @entry() {
   linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>)
   linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b :memref<1x2x?xf32>)
 
-  vector.print str "=> Reading and adding two 3D arrays of scalable vectors:"
+  vector.print str "=> Reading and adding two 3D arrays of scalable vectors:\n"
   func.call @add_arrays_of_scalable_vectors(
     %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> ()
 
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
index d86ff56d79e33c..79121bf31c26ed 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir
@@ -1,4 +1,4 @@
-// DEFINE: %{compile} = mlir-opt %s  -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule\
+// DEFINE: %{compile} = mlir-opt %s  -transform-interpreter -test-transform-dialect-erase-schedule\
 // DEFINE:    -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage\
 // DEFINE:    -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
 // DEFINE: %{entry} =
@@ -188,12 +188,14 @@ func.func @matmul_f32() {
   return
 }
 
-transform.sequence failures(propagate) {
-^bb1(%module_op: !transform.any_op):
-  %f = transform.structured.match ops{["func.func"]} in %module_op
-    : (!transform.any_op) -> !transform.any_op
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op) {
+    %f = transform.structured.match ops{["func.func"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
 
-  transform.apply_patterns to %f {
-    transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
-  } : !transform.any_op
+    transform.apply_patterns to %f {
+      transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+    } : !transform.any_op
+    transform.yield
+  }
 }
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
index 78d6609ccaf9a9..25a44f22c2dc0b 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir
@@ -7,8 +7,8 @@
 
 func.func @entry() {
    // CHECK: Hello, World!
-   vector.print str "Hello, World!"
+   vector.print str "Hello, World!\n"
    // CHECK-NEXT: Bye!
-   vector.print str "Bye!"
+   vector.print str "Bye!\n"
    return
 }
diff --git a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
index 4434aea4ec965f..fa3395533af220 100644
--- a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
+++ b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir
@@ -59,14 +59,48 @@ llvm.func @alias_scopes(%arg1 : !llvm.ptr) {
 #alias_scope1 = #llvm.alias_scope<id = distinct[1]<>, domain = #alias_scope_domain>
 
 // CHECK-LABEL: @noalias_intr_only
-llvm.func @noalias_intr_only(%arg1 : !llvm.ptr) {
-  %0 = llvm.mlir.constant(0 : i32) : i32
-  // CHECK:  call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES1:[0-9]+]])
+llvm.func @noalias_intr_only() {
+  // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]])
   llvm.intr.experimental.noalias.scope.decl #alias_scope1
   llvm.return
 }
 
 // Check the translated metadata.
 // CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]], !"The domain"}
-// CHECK-DAG: ![[SCOPE1:[0-9]+]] = distinct !{![[SCOPE1]], ![[DOMAIN]]}
-// CHECK-DAG: ![[SCOPES1]] = !{![[SCOPE1]]}
+// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
+
+// -----
+
+// This test ensures the alias scope translation creates a temporary metadata
+// node as a placeholder for self-references. Without this, the debug info
+// translation of a type list with a null entry could inadvertently reference
+// access group metadata. This occurs when both translations generate a metadata
+// list with a null entry, which are then uniqued to the same metadata node.
+// The access group translation subsequently updates the null entry to a
+// self-reference, which causes the type list to reference the access
+// group node as well. The use of a temporary placeholder node avoids the issue.
+
+#alias_scope_domain = #llvm.alias_scope_domain<id = distinct[0]<>>
+#alias_scope = #llvm.alias_scope<id = distinct[1]<>, domain = #alias_scope_domain>
+
+#di_null_type = #llvm.di_null_type
+#di_subroutine_type = #llvm.di_subroutine_type<types = #di_null_type>
+#di_file = #llvm.di_file<"attribute-alias-scope.mlir" in "">
+#di_compile_unit = #llvm.di_compile_unit<id = distinct[3]<>, sourceLanguage = DW_LANG_C11, file = #di_file, isOptimized = true, emissionKind = Full>
+#di_subprogram = #llvm.di_subprogram<id = distinct[2]<>, compileUnit = #di_compile_unit, scope = #di_file, file = #di_file, subprogramFlags = "Definition", type = #di_subroutine_type>
+
+// CHECK-LABEL: @self_reference
+llvm.func @self_reference() {
+  // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]])
+  llvm.intr.experimental.noalias.scope.decl #alias_scope
+  llvm.return
+} loc(fused<#di_subprogram>[unknown])
+
+// Check that the translated subroutine types do not reference the access group
+// domain since both of them are created as metadata list with a null entry.
+// CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]}
+// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]}
+// CHECK-DAG: = !DISubroutineType(types: ![[TYPES:[0-9]+]])
+// CHECK-DAG: ![[TYPES]] = !{null}
diff --git a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir
deleted file mode 100644
index dbb67559560031..00000000000000
--- a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir
+++ /dev/null
@@ -1,73 +0,0 @@
-// REQUIRES: asserts
-// RUN: mlir-translate -mlir-to-llvmir %s -debug-only=llvm-dialect-to-llvm-ir 2>&1 | FileCheck %s
-
-// CHECK: Convert initializer for dup_const
-// CHECK: 6 new constants hit
-// CHECK: 3 dangling constants erased
-// CHECK: Convert initializer for unique_const
-// CHECK: 6 new constants hit
-// CHECK: 5 dangling constants erased
-
-
-// CHECK:@dup_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02] }
-
-llvm.mlir.global @dup_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> {
-    %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64
-    %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64
-
-    %empty0 = llvm.mlir.undef : !llvm.array<2 x f64>
-    %a00 = llvm.insertvalue %c0, %empty0[0] : !llvm.array<2 x f64>
-
-    %empty1 = llvm.mlir.undef : !llvm.array<2 x f64>
-    %a10 = llvm.insertvalue %c0, %empty1[0] : !llvm.array<2 x f64>
-
-    %empty2 = llvm.mlir.undef : !llvm.array<2 x f64>
-    %a20 = llvm.insertvalue %c0, %empty2[0] : !llvm.array<2 x f64>
-
-// NOTE: a00, a10, a20 are all same ConstantAggregate which not used at this point.
-//       should not delete it before all of the uses of the ConstantAggregate finished.
-
-    %a01 = llvm.insertvalue %c1, %a00[1] : !llvm.array<2 x f64>
-    %a11 = llvm.insertvalue %c1, %a10[1] : !llvm.array<2 x f64>
-    %a21 = llvm.insertvalue %c1, %a20[1] : !llvm.array<2 x f64>
-    %empty_r = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-    %r0 = llvm.insertvalue %a01, %empty_r[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-    %r1 = llvm.insertvalue %a11, %r0[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-    %r2 = llvm.insertvalue %a21, %r1[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-
-    llvm.return %r2 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-  }
-
-// CHECK:@unique_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.312250e-02, double 5.219230e-02], [2 x double] [double 3.412250e-02, double 5.419230e-02] }
-
-llvm.mlir.global @unique_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> {
-    %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64
-    %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64
-
-    %c2 = llvm.mlir.constant(3.312250e-02 : f64) : f64
-    %c3 = llvm.mlir.constant(5.219230e-02 : f64) : f64
-
-    %c4 = llvm.mlir.constant(3.412250e-02 : f64) : f64
-    %c5 = llvm.mlir.constant(5.419230e-02 : f64) : f64
-
-    %2 = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-
-    %3 = llvm.mlir.undef : !llvm.array<2 x f64>
-
-    %4 = llvm.insertvalue %c0, %3[0] : !llvm.array<2 x f64>
-    %5 = llvm.insertvalue %c1, %4[1] : !llvm.array<2 x f64>
-
-    %6 = llvm.insertvalue %5, %2[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-
-    %7 = llvm.insertvalue %c2, %3[0] : !llvm.array<2 x f64>
-    %8 = llvm.insertvalue %c3, %7[1] : !llvm.array<2 x f64>
-
-    %9 = llvm.insertvalue %8, %6[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-
-    %10 = llvm.insertvalue %c4, %3[0] : !llvm.array<2 x f64>
-    %11 = llvm.insertvalue %c5, %10[1] : !llvm.array<2 x f64>
-
-    %12 = llvm.insertvalue %11, %9[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-
-    llvm.return %12 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)>
-}
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index 39a1e036e85c02..12bd108ba86cb6 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -2186,6 +2186,43 @@ llvm.func @single_nowait(%x: i32, %y: i32, %zaddr: !llvm.ptr) {
 
 // -----
 
+llvm.func @copy_i32(!llvm.ptr, !llvm.ptr)
+llvm.func @copy_f32(!llvm.ptr, !llvm.ptr)
+
+// CHECK-LABEL: @single_copyprivate
+// CHECK-SAME: (ptr %[[ip:.*]], ptr %[[fp:.*]])
+llvm.func @single_copyprivate(%ip: !llvm.ptr, %fp: !llvm.ptr) {
+  // CHECK: %[[didit_addr:.*]] = alloca i32
+  // CHECK: store i32 0, ptr %[[didit_addr]]
+  // CHECK: call i32 @__kmpc_single
+  omp.single copyprivate(%ip -> @copy_i32 : !llvm.ptr, %fp -> @copy_f32 : !llvm.ptr) {
+    // CHECK: %[[i:.*]] = load i32, ptr %[[ip]]
+    %i = llvm.load %ip : !llvm.ptr -> i32
+    // CHECK: %[[i2:.*]] = add i32 %[[i]], %[[i]]
+    %i2 = llvm.add %i, %i : i32
+    // CHECK: store i32 %[[i2]], ptr %[[ip]]
+    llvm.store %i2, %ip : i32, !llvm.ptr
+    // CHECK: %[[f:.*]] = load float, ptr %[[fp]]
+    %f = llvm.load %fp : !llvm.ptr -> f32
+    // CHECK: %[[f2:.*]] = fadd float %[[f]], %[[f]]
+    %f2 = llvm.fadd %f, %f : f32
+    // CHECK: store float %[[f2]], ptr %[[fp]]
+    llvm.store %f2, %fp : f32, !llvm.ptr
+    // CHECK: store i32 1, ptr %[[didit_addr]]
+    // CHECK: call void @__kmpc_end_single
+    // CHECK: %[[didit:.*]] = load i32, ptr %[[didit_addr]]
+    // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[ip]], ptr @copy_i32, i32 %[[didit]])
+    // CHECK: %[[didit2:.*]] = load i32, ptr %[[didit_addr]]
+    // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[fp]], ptr @copy_f32, i32 %[[didit2]])
+    // CHECK-NOT: call void @__kmpc_barrier
+    omp.terminator
+  }
+  // CHECK: ret void
+  llvm.return
+}
+
+// -----
+
 // CHECK: @_QFsubEx = internal global i32 undef
 // CHECK: @_QFsubEx.cache = common global ptr null
 
diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir
new file mode 100644
index 00000000000000..58bda87c3b7bea
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-private.mlir
@@ -0,0 +1,142 @@
+// Test code-gen for `omp.parallel` ops with delayed privatizers (i.e. using
+// `omp.private` ops).
+
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) {
+  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_1_private
+// CHECK-SAME: (ptr %[[ORIG:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8
+// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_1_private..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4
+// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4
+// CHECK: }
+
+llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    %1 = llvm.load %arg3 : !llvm.ptr -> i32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: @parallel_op_2_privates
+// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) {
+// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8
+// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0
+// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8
+// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_2_privates..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8
+// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1
+// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4
+// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4
+// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4
+// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4
+// CHECK: }
+
+omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+  llvm.store %1, %0 : f32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}
+
+omp.private {type = private} @y.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> i32
+  llvm.store %1, %0 : i32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}
+
+// -----
+
+llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) {
+  omp.parallel private(@multi_block.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: define internal void @parallel_op_private_multi_block..omp_par
+// CHECK: omp.par.entry:
+// CHECK:  %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
+// CHECK:  %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+// CHECK:   br label %[[PRIV_BB1:.*]]
+
+// Check contents of the first block in the `alloc` region.
+// CHECK: [[PRIV_BB1]]:
+// CHECK-NEXT:   %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK-NEXT:   br label %[[PRIV_BB2:.*]]
+
+// Check contents of the second block in the `alloc` region.
+// CHECK: [[PRIV_BB2]]:
+// CHECK-NEXT:   %[[ORIG_PTR2:.*]] = phi ptr [ %[[ORIG_PTR]], %[[PRIV_BB1]] ]
+// CHECK-NEXT:   %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB1]] ]
+// CHECK-NEXT:   %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR2]], align 4
+// CHECK-NEXT:   store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT:   br label %[[PRIV_CONT:.*]]
+
+// Check that the privatizer's continuation block yileds the private clone's
+// address.
+// CHECK: [[PRIV_CONT]]:
+// CHECK-NEXT:   %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB2]] ]
+// CHECK-NEXT:   br label %[[PAR_REG:.*]]
+
+// Check that the body of the parallel region loads from the private clone.
+// CHECK: [[PAR_REG]]:
+// CHECK:        %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4
+
+omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  llvm.br ^bb1(%arg0, %0 : !llvm.ptr, !llvm.ptr)
+
+^bb1(%arg1: !llvm.ptr, %arg2: !llvm.ptr):
+  %1 = llvm.load %arg1 : !llvm.ptr -> f32
+  llvm.store %1, %arg2 : f32, !llvm.ptr
+  omp.yield(%arg2 : !llvm.ptr)
+}
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index bde4255ee4b368..abc0e43c7b7f2d 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1768,7 +1768,6 @@ struct TestMergeSingleBlockOps
     rewriter.inlineBlockBefore(&innerBlock, op);
     rewriter.eraseOp(innerTerminator);
     rewriter.eraseOp(op);
-    rewriter.modifyOpInPlace(op, [] {});
     return success();
   }
 };
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 178a58e796b246..915f713f7047be 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -630,15 +630,13 @@ struct TestVectorDistribution
     });
     MLIRContext *ctx = &getContext();
     auto distributionFn = [](Value val) {
-      // Create a map (d0, d1) -> (d1) to distribute along the inner
-      // dimension. Once we support n-d distribution we can add more
-      // complex cases.
+      // Create an identity dim map of the same rank as the vector.
       VectorType vecType = dyn_cast<VectorType>(val.getType());
       int64_t vecRank = vecType ? vecType.getRank() : 0;
       OpBuilder builder(val.getContext());
       if (vecRank == 0)
         return AffineMap::get(val.getContext());
-      return AffineMap::get(vecRank, 0, builder.getAffineDimExpr(vecRank - 1));
+      return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
     };
     auto shuffleFn = [](Location loc, OpBuilder &builder, Value val,
                         Value srcIdx, int64_t warpSz) {
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 904dfb680a0404..7636ef30c2d3ef 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -161,6 +161,7 @@ def add_runtime(name):
         ToolSubst("transform-opt-ch2", unresolved="ignore"),
         ToolSubst("transform-opt-ch3", unresolved="ignore"),
         ToolSubst("transform-opt-ch4", unresolved="ignore"),
+        ToolSubst("mlir-transform-opt", unresolved="ignore"),
         ToolSubst("%mlir_lib_dir", config.mlir_lib_dir, unresolved="ignore"),
         ToolSubst("%mlir_src_dir", config.mlir_src_root, unresolved="ignore"),
     ]
diff --git a/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
index 44141cc4eeaf42..0bf6523c5e5d5c 100644
--- a/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
+++ b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
@@ -13,10 +13,21 @@ func.func @trunc_bf16(%a : f32) {
 }
 
 func.func @main() {
-  // CHECK: 1.00781
-  %roundOneI = arith.constant 0x3f808000 : i32
-  %roundOneF = arith.bitcast %roundOneI : i32 to f32
-  call @trunc_bf16(%roundOneF): (f32) -> ()
+  // Note: this is a tie (low 16 bits are 0x8000). We expect the rounding behavior
+  // to break ties "to nearest-even", which in this case means downwards,
+  // since bit 16 is not set.
+  // CHECK: 1
+  %value_1_00391_I = arith.constant 0x3f808000 : i32
+  %value_1_00391_F = arith.bitcast %value_1_00391_I : i32 to f32
+  call @trunc_bf16(%value_1_00391_F): (f32) -> ()
+
+  // Note: this is a tie (low 16 bits are 0x8000). We expect the rounding behavior
+  // to break ties "to nearest-even", which in this case means upwards,
+  // since bit 16 is set.
+  // CHECK-NEXT: 1.01562
+  %value_1_01172_I = arith.constant 0x3f818000 : i32
+  %value_1_01172_F = arith.bitcast %value_1_01172_I : i32 to f32
+  call @trunc_bf16(%value_1_01172_F): (f32) -> ()
 
   // CHECK-NEXT: -1
   %noRoundNegOneI = arith.constant 0xbf808000 : i32
@@ -38,15 +49,27 @@ func.func @main() {
   %neginff = arith.bitcast %neginfi : i32 to f32
   call @trunc_bf16(%neginff): (f32) -> ()
 
+  // Note: this rounds upwards. As the mantissa was already saturated, this rounding
+  // causes the exponent to be incremented. As the exponent was already the
+  // maximum exponent value for finite values, this increment of the exponent
+  // causes this to overflow to +inf.
+  // CHECK-NEXT: inf
+  %big_overflowing_i = arith.constant 0x7f7fffff : i32
+  %big_overflowing_f = arith.bitcast %big_overflowing_i : i32 to f32
+  call @trunc_bf16(%big_overflowing_f): (f32) -> ()
+
+  // Same as the previous testcase but negative.
+  // CHECK-NEXT: -inf
+  %negbig_overflowing_i = arith.constant 0xff7fffff : i32
+  %negbig_overflowing_f = arith.bitcast %negbig_overflowing_i : i32 to f32
+  call @trunc_bf16(%negbig_overflowing_f): (f32) -> ()
+
+  // In contrast to the previous two testcases, the upwards-rounding here
+  // does not cause overflow.
   // CHECK-NEXT: 3.38953e+38
-  %bigi = arith.constant 0x7f7fffff : i32
-  %bigf = arith.bitcast %bigi : i32 to f32
-  call @trunc_bf16(%bigf): (f32) -> ()
-
-  // CHECK-NEXT: -3.38953e+38
-  %negbigi = arith.constant 0xff7fffff : i32
-  %negbigf = arith.bitcast %negbigi : i32 to f32
-  call @trunc_bf16(%negbigf): (f32) -> ()
+  %big_nonoverflowing_i = arith.constant 0x7f7effff : i32
+  %big_nonoverflowing_f = arith.bitcast %big_nonoverflowing_i : i32 to f32
+  call @trunc_bf16(%big_nonoverflowing_f): (f32) -> ()
 
   // CHECK-NEXT: 1.625
   %exprolli = arith.constant 0x3fcfffff : i32
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index 62a19c084cac0f..943e7d5c120b87 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -120,7 +120,8 @@ static Match synZeroMatch() { return Match(); }
 FOREVERY_BINOP(IMPL_BINOP_PATTERN)
 #undef IMPL_BINOP_PATTERN
 
-class MergerTestBase : public ::testing::Test {
+// Parameterize LevelFormat to test both Dense and Batch LevelFormat.
+class MergerTestBase : public ::testing::TestWithParam<LevelFormat> {
 protected:
   MergerTestBase(unsigned numTensors, unsigned numLoops)
       : merger(numTensors, numLoops, /*maxRank=*/numLoops) {
@@ -317,10 +318,14 @@ class MergerTest3T1L : public MergerTestBase {
     // Tensor 1: sparse input vector.
     merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Compressed);
     // Tensor 2: dense output vector.
-    merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(2), lid(0), 0, GetParam());
   }
 };
 
+INSTANTIATE_TEST_SUITE_P(Test3T1L, MergerTest3T1L,
+                         ::testing::Values(LevelFormat::Dense,
+                                           LevelFormat::Batch));
+
 /// Four tensors (three inputs, one output); and a single loop.
 class MergerTest4T1L : public MergerTestBase {
 protected:
@@ -333,10 +338,14 @@ class MergerTest4T1L : public MergerTestBase {
     // Tensor 2: sparse input vector
     merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Compressed);
     // Tensor 3: dense output vector
-    merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(3), lid(0), 0, GetParam());
   }
 };
 
+INSTANTIATE_TEST_SUITE_P(Test4T1L, MergerTest4T1L,
+                         ::testing::Values(LevelFormat::Dense,
+                                           LevelFormat::Batch));
+
 ///
 /// Tests with both sparse and dense input.
 ///
@@ -349,12 +358,16 @@ class MergerTest3T1LD : public MergerTestBase {
     // Tensor 0: sparse input vector.
     merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Compressed);
     // Tensor 1: dense input vector.
-    merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(1), lid(0), 0, GetParam());
     // Tensor 2: dense output vector.
-    merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(2), lid(0), 0, GetParam());
   }
 };
 
+INSTANTIATE_TEST_SUITE_P(Test3T1LD, MergerTest3T1LD,
+                         ::testing::Values(LevelFormat::Dense,
+                                           LevelFormat::Batch));
+
 ///
 /// Tests with both undef and dense input.
 ///
@@ -367,14 +380,18 @@ class MergerTest4T1LU : public MergerTestBase {
     // Tensor 0: undef input vector.
     merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Undef);
     // Tensor 1: dense input vector.
-    merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(1), lid(0), 0, GetParam());
     // Tensor 2: undef input vector.
     merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Undef);
     // Tensor 3: dense output vector.
-    merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(3), lid(0), 0, GetParam());
   }
 };
 
+INSTANTIATE_TEST_SUITE_P(Test4T1LU, MergerTest4T1LU,
+                         ::testing::Values(LevelFormat::Dense,
+                                           LevelFormat::Batch));
+
 ///
 /// Tests with operation on sparse output.
 ///
@@ -395,6 +412,11 @@ class MergerTest3T1LSo : public MergerTestBase {
   }
 };
 
+// This testsuite does not use any dense-like format, just one of {Dense, Batch}
+// is enough.
+INSTANTIATE_TEST_SUITE_P(Test3T1LSo, MergerTest3T1LSo,
+                         ::testing::Values(LevelFormat::Dense));
+
 } // namespace
 
 /// Vector multiplication (conjunction) of 3 vectors, i.e.;
@@ -409,7 +431,7 @@ class MergerTest3T1LSo : public MergerTestBase {
 ///   lat( i_01_D / (tensor_0 * tensor_1 * tensor2) )
 /// }
 #define IMPL_MERGER_TEST_CONJ_CONJ_UNDEF(CONJ1, CONJ2)                         \
-  TEST_F(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) {                          \
+  TEST_P(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) {                          \
     const auto em = CONJ1##Expr(tensor(0), tensor(1));                         \
     const auto e = CONJ2##Expr(em, tensor(2));                                 \
     const auto l0 = lid(0);                                                    \
@@ -443,7 +465,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF)
 ///   lat( i_03_U / (tensor_0 * tensor_1 * output_tensor2) )
 /// }
 #define IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT(CONJ1, CONJ2)                    \
-  TEST_F(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) {                         \
+  TEST_P(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) {                         \
     const auto em = CONJ1##Expr(tensor(0), tensor(1));                         \
     const auto e = CONJ2##Expr(em, tensor(2));                                 \
     const auto l0 = lid(0);                                                    \
@@ -482,7 +504,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT)
 ///   lat( i_01 / tensor_1 )
 /// }
 #define IMPL_MERGER_TEST_DISJ(OP, UNUSED)                                      \
-  TEST_F(MergerTest3T1L, vector_##OP) {                                        \
+  TEST_P(MergerTest3T1L, vector_##OP) {                                        \
     const auto e = OP##Expr(tensor(0), tensor(1));                             \
     const auto l0 = lid(0);                                                    \
     const auto t0 = tid(0);                                                    \
@@ -514,7 +536,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ)
 ///   lat( i_00 i_01 / (tensor_0 * tensor_1) )
 /// }
 #define IMPL_MERGER_TEST_CONJ(OP, UNUSED)                                      \
-  TEST_F(MergerTest3T1L, vector_##OP) {                                        \
+  TEST_P(MergerTest3T1L, vector_##OP) {                                        \
     const auto e = OP##Expr(tensor(0), tensor(1));                             \
     const auto l0 = lid(0);                                                    \
     const auto t0 = tid(0);                                                    \
@@ -544,7 +566,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ)
 ///    lat( i_02 / tensor_2 )
 /// }
 #define IMPL_MERGER_TEST_CONJ_DISJ(CONJ, DISJ)                                 \
-  TEST_F(MergerTest4T1L, vector_##CONJ##_##DISJ) {                             \
+  TEST_P(MergerTest4T1L, vector_##CONJ##_##DISJ) {                             \
     const auto em = CONJ##Expr(tensor(0), tensor(1));                          \
     const auto e = DISJ##Expr(em, tensor(2));                                  \
     const auto l0 = lid(0);                                                    \
@@ -587,7 +609,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ)
 ///   lat( i_00 / tensor_0 )
 /// }
 #define IMPL_MERGER_TEST_DISJ_DISJ(DISJ1, DISJ2)                               \
-  TEST_F(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) {                           \
+  TEST_P(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) {                           \
     const auto em = DISJ1##Expr(tensor(0), tensor(1));                         \
     const auto e = DISJ2##Expr(em, tensor(2));                                 \
     const auto l0 = lid(0);                                                    \
@@ -636,7 +658,7 @@ FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ)
 ///    lat( i_00 i_01 i_02 / tensor_0 * tensor_1 * tensor_2 )
 /// }
 #define IMPL_MERGER_TEST_CONJ_CONJ(CONJ1, CONJ2)                               \
-  TEST_F(MergerTest4T1L, vector_##CONJ1##_##CONJ2) {                           \
+  TEST_P(MergerTest4T1L, vector_##CONJ1##_##CONJ2) {                           \
     const auto em = CONJ1##Expr(tensor(0), tensor(1));                         \
     const auto e = CONJ2##Expr(em, tensor(2));                                 \
     const auto l0 = lid(0);                                                    \
@@ -675,7 +697,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ)
 /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff
 /// with lat( i_00 i_01 / (sparse_tensor_0 + dense_tensor_1) ).
 #define IMPL_MERGER_TEST_OPTIMIZED_DISJ(OP, UNUSED)                            \
-  TEST_F(MergerTest3T1LD, vector_opted_##OP) {                                 \
+  TEST_P(MergerTest3T1LD, vector_opted_##OP) {                                 \
     const auto e = OP##Expr(tensor(0), tensor(1));                             \
     const auto l0 = lid(0);                                                    \
     const auto t0 = tid(0);                                                    \
@@ -711,7 +733,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ)
 /// }
 /// since i_01 is a dense dimension.
 #define IMPL_MERGER_TEST_OPTIMIZED_CONJ(OP, UNUSED)                            \
-  TEST_F(MergerTest3T1LD, vector_opted_##OP) {                                 \
+  TEST_P(MergerTest3T1LD, vector_opted_##OP) {                                 \
     const auto e = OP##Expr(tensor(0), tensor(1));                             \
     const auto l0 = lid(0);                                                    \
     const auto t0 = tid(0);                                                    \
@@ -746,7 +768,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ)
 ///   lat( i_00 / tensor_0 cmp 0 )
 ///   lat( i_01 / 0 cmp tensor_1 )
 /// }
-TEST_F(MergerTest3T1L, vector_cmp) {
+TEST_P(MergerTest3T1L, vector_cmp) {
   const auto e = cmpiExpr(tensor(0), tensor(1));
   const auto l0 = lid(0);
   const auto t0 = tid(0);
@@ -784,7 +806,7 @@ TEST_F(MergerTest3T1L, vector_cmp) {
 ///
 /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff
 /// with lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ).
-TEST_F(MergerTest3T1LD, vector_cmp) {
+TEST_P(MergerTest3T1LD, vector_cmp) {
   const auto e = cmpiExpr(tensor(0), tensor(1));
   const auto l0 = lid(0);
   const auto t0 = tid(0);
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 26bfbd5c11e81e..cea49356538f0a 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Config/mlir-config.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"
 #include "mlir/IR/MLIRContext.h"
@@ -29,10 +30,10 @@
 using namespace mlir;
 
 // Skip the test if the NVPTX target was not built.
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 0
-#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
-#else
+#if MLIR_ENABLE_CUDA_CONVERSIONS
 #define SKIP_WITHOUT_NVPTX(x) x
+#else
+#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x
 #endif
 
 class MLIRTargetLLVMNVVM : public ::testing::Test {
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
index 17eb30c8e458c2..7d815bc4a2299c 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl
@@ -17,6 +17,7 @@ def libc_common_copts():
     libc_include_path = paths.join(root_label.workspace_root, root_label.package)
     return [
         "-I" + libc_include_path,
+        "-I" + paths.join(libc_include_path, "include"),
         "-DLIBC_NAMESPACE=" + LIBC_NAMESPACE,
     ]
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 22345974247abd..375d04674479a0 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -6,7 +6,6 @@
 #   The MLIR "Multi-Level Intermediate Representation" Compiler Infrastructure
 
 load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
-load("@bazel_skylib//rules:write_file.bzl", "write_file")
 load(
     ":build_defs.bzl",
     "cc_headers_only",
@@ -36,7 +35,10 @@ expand_template(
         "#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0",
         "#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */",
         "#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1",
-    },
+    } | if_cuda_available(
+        {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 1"},
+        {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 0"},
+    ),
     template = "include/mlir/Config/mlir-config.h.cmake",
 )
 
@@ -4108,6 +4110,7 @@ cc_library(
     includes = ["include"],
     deps = [
         ":AffineDialect",
+        ":AffineTransforms",
         ":AffineUtils",
         ":ArithDialect",
         ":ConversionPassIncGen",
@@ -5467,7 +5470,6 @@ cc_library(
     srcs = ["lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp"],
     hdrs = ["include/mlir/Dialect/GPU/Pipelines/Passes.h"],
     includes = ["include"],
-    local_defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     deps = [
         ":AffineToStandard",
         ":ArithToLLVM",
@@ -5491,6 +5493,7 @@ cc_library(
         ":Transforms",
         ":VectorToLLVM",
         ":VectorToSCF",
+        ":config",
     ],
 )
 
@@ -5540,6 +5543,7 @@ cc_library(
         ":Transforms",
         ":VCIXToLLVMIRTranslation",
         ":VectorDialect",
+        ":config",
         "//llvm:Core",
         "//llvm:MC",
         "//llvm:Support",
@@ -6175,6 +6179,7 @@ cc_library(
         ":NVVMToLLVMIRTranslation",
         ":TargetLLVM",
         ":ToLLVMIRTranslation",
+        ":config",
         "//llvm:NVPTXCodeGen",
         "//llvm:Support",
     ],
@@ -9182,6 +9187,7 @@ cc_library(
         ":VectorTransforms",
         ":X86VectorDialect",
         ":X86VectorTransforms",
+        ":config",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 68d9b23fd5643e..583411aa60e555 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -552,7 +552,6 @@ cc_library(
 cc_library(
     name = "TestTransforms",
     srcs = glob(["lib/Transforms/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -579,7 +578,6 @@ cc_library(
 cc_library(
     name = "TestFuncToLLVM",
     srcs = glob(["lib/Conversion/FuncToLLVM/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -594,7 +592,6 @@ cc_library(
 cc_library(
     name = "TestOneToNTypeConversion",
     srcs = glob(["lib/Conversion/OneToNTypeConversion/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -653,7 +650,6 @@ cc_library(
 cc_library(
     name = "TestDLTI",
     srcs = glob(["lib/Dialect/DLTI/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -667,7 +663,7 @@ cc_library(
 cc_library(
     name = "TestGPU",
     srcs = glob(["lib/Dialect/GPU/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"] + if_cuda_available([
+    defines = if_cuda_available([
         "MLIR_GPU_TO_CUBIN_PASS_ENABLE",
     ]),
     includes = ["lib/Dialect/Test"],
@@ -714,7 +710,6 @@ cc_library(
 cc_library(
     name = "TestLinalg",
     srcs = glob(["lib/Dialect/Linalg/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//llvm:Support",
@@ -748,7 +743,6 @@ cc_library(
 cc_library(
     name = "TestLLVM",
     srcs = glob(["lib/Dialect/LLVM/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:AffineToStandard",
@@ -773,7 +767,6 @@ cc_library(
 cc_library(
     name = "TestMath",
     srcs = glob(["lib/Dialect/Math/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:ArithDialect",
@@ -790,7 +783,6 @@ cc_library(
 cc_library(
     name = "TestMathToVCIX",
     srcs = glob(["lib/Conversion/MathToVCIX/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:ArithDialect",
@@ -807,7 +799,6 @@ cc_library(
 cc_library(
     name = "TestMemRef",
     srcs = glob(["lib/Dialect/MemRef/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -847,7 +838,6 @@ cc_library(
 cc_library(
     name = "TestNVGPU",
     srcs = glob(["lib/Dialect/NVGPU/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:AffineDialect",
@@ -871,7 +861,6 @@ cc_library(
 cc_library(
     name = "TestSCF",
     srcs = glob(["lib/Dialect/SCF/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//llvm:Support",
@@ -891,7 +880,6 @@ cc_library(
 cc_library(
     name = "TestArith",
     srcs = glob(["lib/Dialect/Arith/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:ArithDialect",
@@ -908,7 +896,6 @@ cc_library(
 cc_library(
     name = "TestArmSME",
     srcs = glob(["lib/Dialect/ArmSME/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:ArithToArmSME",
@@ -927,7 +914,6 @@ cc_library(
 cc_library(
     name = "TestBufferization",
     srcs = glob(["lib/Dialect/Bufferization/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:BufferizationDialect",
@@ -989,7 +975,6 @@ cc_library(
 cc_library(
     name = "TestFunc",
     srcs = glob(["lib/Dialect/Func/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         ":TestDialect",
@@ -1005,7 +990,6 @@ cc_library(
 cc_library(
     name = "TestTensor",
     srcs = glob(["lib/Dialect/Tensor/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:ArithDialect",
@@ -1022,7 +1006,6 @@ cc_library(
 cc_library(
     name = "TestVector",
     srcs = glob(["lib/Dialect/Vector/*.cpp"]),
-    defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"],
     includes = ["lib/Dialect/Test"],
     deps = [
         "//mlir:AffineDialect",