From 1d1186de34c55149be336068bf312e8f755dca37 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Tue, 27 Feb 2024 12:28:25 -0800
Subject: [PATCH 001/114] [llvm-exegesis] Add loop-register snippet annotation
 (#82873)

This patch adds an LLVM-EXEGESIS-LOOP-REGISTER snippet annotation which
allows a user to specify the register to use for the loop counter in the
loop repetition mode. This allows for executing snippets that don't work
with the default value (currently R8 on X86).
---
 llvm/docs/CommandGuide/llvm-exegesis.rst      |  8 ++++
 .../llvm-exegesis/X86/latency/loop-register.s | 12 ++++++
 .../tools/llvm-exegesis/lib/BenchmarkResult.h |  2 +
 llvm/tools/llvm-exegesis/lib/SnippetFile.cpp  | 20 ++++++++++
 .../llvm-exegesis/lib/SnippetRepetitor.cpp    | 14 +++----
 .../llvm-exegesis/lib/SnippetRepetitor.h      |  3 +-
 llvm/tools/llvm-exegesis/lib/Target.h         |  7 +++-
 llvm/tools/llvm-exegesis/lib/X86/Target.cpp   | 18 +++++----
 llvm/tools/llvm-exegesis/llvm-exegesis.cpp    | 37 ++++++++++++-------
 .../llvm-exegesis/X86/SnippetFileTest.cpp     | 19 ++++++++++
 .../X86/SnippetRepetitorTest.cpp              | 16 +++++---
 11 files changed, 117 insertions(+), 39 deletions(-)
 create mode 100644 llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s

diff --git a/llvm/docs/CommandGuide/llvm-exegesis.rst b/llvm/docs/CommandGuide/llvm-exegesis.rst
index 9e3c19078f1cce2..fdf17c7fe41285e 100644
--- a/llvm/docs/CommandGuide/llvm-exegesis.rst
+++ b/llvm/docs/CommandGuide/llvm-exegesis.rst
@@ -89,6 +89,14 @@ properly.
   annotation requires the subprocess execution mode. This is useful in cases
   where the memory accessed by the snippet depends on the location of the
   snippet, like RIP-relative addressing.
+* `LLVM-EXEGESIS-LOOP-REGISTER <register name>` - This annotation specifies
+  the loop register to use for keeping track of the current iteration when
+  using the loop repetition mode. :program:`llvm-exegesis` needs to keep track
+  of the current loop iteration within the loop repetition mode in a performant
+  manner (i.e., no memory accesses), and uses a register to do this. This register
+  has an architecture-specific default (e.g., `R8` on X86), but this might conflict
+  with some snippets. This annotation allows changing the register to prevent
+  interference between the loop index register and the snippet.

 EXAMPLE 1: benchmarking instructions
 ------------------------------------

diff --git a/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
new file mode 100644
index 000000000000000..81ca75251381ad1
--- /dev/null
+++ b/llvm/test/tools/llvm-exegesis/X86/latency/loop-register.s
@@ -0,0 +1,12 @@
+# REQUIRES: exegesis-can-measure-latency, x86_64-linux
+
+# Test that specifying the loop register to use works as expected.
+
+# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -snippets-file=%s | FileCheck %s
+
+# CHECK: measurements:
+
+# LLVM-EXEGESIS-DEFREG R11 ff
+# LLVM-EXEGESIS-LOOP-REGISTER R12
+
+addq $0xff, %r11

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index 0aecaaeea4b2e76..4ae6bc2a54cd501 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -74,6 +74,8 @@ struct BenchmarkKey {
   // The address that the snippet should be loaded in at if the execution mode
   // being used supports it.
   intptr_t SnippetAddress = 0;
+  // The register that should be used to hold the loop counter.
+  unsigned LoopRegister;
 };

 struct BenchmarkMeasure {

diff --git a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
index 7258fcb4279c7da..431d99c72b80869 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetFile.cpp
@@ -9,6 +9,7 @@
 #include "SnippetFile.h"
 #include "BenchmarkRunner.h"
 #include "Error.h"
+#include "Target.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCObjectFileInfo.h"
@@ -175,6 +176,20 @@ class BenchmarkCodeStreamer : public MCStreamer, public AsmCommentConsumer {
       return;
     }

+    if (CommentText.consume_front("LOOP-REGISTER")) {
+      // LLVM-EXEGESIS-LOOP-REGISTER <loop register>
+      unsigned LoopRegister;
+
+      if (!(LoopRegister = findRegisterByName(CommentText.trim()))) {
+        errs() << "unknown register '" << CommentText
+               << "' in 'LLVM-EXEGESIS-LOOP-REGISTER " << CommentText << "'\n";
+        ++InvalidComments;
+        return;
+      }
+
+      Result->Key.LoopRegister = LoopRegister;
+      return;
+    }
   }

   unsigned numInvalidComments() const { return InvalidComments; }
@@ -221,6 +236,11 @@ Expected<std::vector<BenchmarkCode>> readSnippets(const LLVMState &State,

   BenchmarkCode Result;

+  // Ensure that there is a default loop register value specified.
+  Result.Key.LoopRegister =
+      State.getExegesisTarget().getDefaultLoopCounterRegister(
+          State.getTargetMachine().getTargetTriple());
+
   const TargetMachine &TM = State.getTargetMachine();
   MCContext Context(TM.getTargetTriple(), TM.getMCAsmInfo(),
                     TM.getMCRegisterInfo(), TM.getMCSubtargetInfo());

diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
index 561687a62319b3e..0bab30d1582003b 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -48,10 +48,8 @@ class DuplicateSnippetRepetitor : public SnippetRepetitor {

 class LoopSnippetRepetitor : public SnippetRepetitor {
 public:
-  explicit LoopSnippetRepetitor(const LLVMState &State)
-      : SnippetRepetitor(State),
-        LoopCounter(State.getExegesisTarget().getLoopCounterRegister(
-            State.getTargetMachine().getTargetTriple())) {}
+  explicit LoopSnippetRepetitor(const LLVMState &State, unsigned LoopRegister)
+      : SnippetRepetitor(State), LoopCounter(LoopRegister) {}

   // Loop over the snippet ceil(MinInstructions / Instructions.Size()) times.
   FillFunction Repeat(ArrayRef<MCInst> Instructions, unsigned MinInstructions,
@@ -113,8 +111,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor {
         (void)_;
         Loop.addInstructions(Instructions);
       }
-      ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
-                                     State.getInstrInfo());
+      ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, State.getInstrInfo(),
+                                     LoopCounter);

       // Set up the exit basic block.
 Loop.MBB->addSuccessor(Exit.MBB, BranchProbability::getZero());
@@ -138,14 +136,14 @@ SnippetRepetitor::~SnippetRepetitor() {}

 std::unique_ptr<const SnippetRepetitor>
 SnippetRepetitor::Create(Benchmark::RepetitionModeE Mode,
-                         const LLVMState &State) {
+                         const LLVMState &State, unsigned LoopRegister) {
   switch (Mode) {
   case Benchmark::Duplicate:
   case Benchmark::MiddleHalfDuplicate:
     return std::make_unique<DuplicateSnippetRepetitor>(State);
   case Benchmark::Loop:
   case Benchmark::MiddleHalfLoop:
-    return std::make_unique<LoopSnippetRepetitor>(State);
+    return std::make_unique<LoopSnippetRepetitor>(State, LoopRegister);
   case Benchmark::AggregateMin:
     break;
   }

diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
index 2b3c416c9029f70..c62e80f161f1283 100644
--- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
+++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.h
@@ -29,7 +29,8 @@ namespace exegesis {
 class SnippetRepetitor {
 public:
   static std::unique_ptr<const SnippetRepetitor>
-  Create(Benchmark::RepetitionModeE Mode, const LLVMState &State);
+  Create(Benchmark::RepetitionModeE Mode, const LLVMState &State,
+         unsigned LoopRegister);

   virtual ~SnippetRepetitor();

diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h
index 7bbd946b03331f5..522c75d15703d57 100644
--- a/llvm/tools/llvm-exegesis/lib/Target.h
+++ b/llvm/tools/llvm-exegesis/lib/Target.h
@@ -202,12 +202,15 @@ class ExegesisTarget {
   }

   // Returns a counter usable as a loop counter.
-  virtual unsigned getLoopCounterRegister(const Triple &) const { return 0; }
+  virtual unsigned getDefaultLoopCounterRegister(const Triple &) const {
+    return 0;
+  }

   // Adds the code to decrement the loop counter and
   virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                            MachineBasicBlock &TargetMBB,
-                                           const MCInstrInfo &MII) const {
+                                           const MCInstrInfo &MII,
+                                           unsigned LoopRegister) const {
     llvm_unreachable("decrementLoopCounterAndBranch() requires "
                      "getLoopCounterRegister() > 0");
   }

diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
index 6fc951a6e35d6a8..a41a995f5560af8 100644
--- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -720,7 +720,7 @@ class ExegesisX86Target : public ExegesisTarget {

   unsigned getScratchMemoryRegister(const Triple &TT) const override;

-  unsigned getLoopCounterRegister(const Triple &) const override;
+  unsigned getDefaultLoopCounterRegister(const Triple &) const override;

   unsigned getMaxMemoryAccessSize() const override { return 64; }

@@ -733,7 +733,8 @@ class ExegesisX86Target : public ExegesisTarget {

   void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
                                    MachineBasicBlock &TargetMBB,
-                                   const MCInstrInfo &MII) const override;
+                                   const MCInstrInfo &MII,
+                                   unsigned LoopRegister) const override;

   std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, unsigned Reg,
                                const APInt &Value) const override;

@@ -852,7 +853,7 @@ const unsigned ExegesisX86Target::kUnavailableRegistersSSE[12] = {
 // We're using one of R8-R15 because these registers are never hardcoded in
 // instructions (e.g. MOVS writes to EDI, ESI, EDX), so they have less
 // conflicts.
-constexpr const unsigned kLoopCounterReg = X86::R8;
+constexpr const unsigned kDefaultLoopCounterReg = X86::R8;

 } // namespace

@@ -870,11 +871,12 @@ unsigned ExegesisX86Target::getScratchMemoryRegister(const Triple &TT) const {
   return TT.isOSWindows() ? X86::RCX : X86::RDI;
 }

-unsigned ExegesisX86Target::getLoopCounterRegister(const Triple &TT) const {
+unsigned
+ExegesisX86Target::getDefaultLoopCounterRegister(const Triple &TT) const {
   if (!TT.isArch64Bit()) {
     return 0;
   }
-  return kLoopCounterReg;
+  return kDefaultLoopCounterReg;
 }

 Error ExegesisX86Target::randomizeTargetMCOperand(
@@ -912,10 +914,10 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,

 void ExegesisX86Target::decrementLoopCounterAndJump(
     MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
-    const MCInstrInfo &MII) const {
+    const MCInstrInfo &MII, unsigned LoopRegister) const {
   BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
-      .addDef(kLoopCounterReg)
-      .addUse(kLoopCounterReg)
+      .addDef(LoopRegister)
+      .addUse(LoopRegister)
       .addImm(-1);
   BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
       .addMBB(&TargetMBB)

diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
index 782d44422791ca1..1ae2565e894c694 100644
--- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
+++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp
@@ -497,22 +497,42 @@ void benchmarkMain() {
   }

   const auto Opcodes = getOpcodesOrDie(State);
+  std::vector<BenchmarkCode> Configurations;
+
+  unsigned LoopRegister =
+      State.getExegesisTarget().getDefaultLoopCounterRegister(
+          State.getTargetMachine().getTargetTriple());
+
+  if (Opcodes.empty()) {
+    Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
+    for (const auto &Configuration : Configurations) {
+      if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
+          (Configuration.Key.MemoryMappings.size() != 0 ||
+           Configuration.Key.MemoryValues.size() != 0 ||
+           Configuration.Key.SnippetAddress != 0))
+        ExitWithError("Memory and snippet address annotations are only "
+                      "supported in subprocess "
+                      "execution mode");
+    }
+    LoopRegister = Configurations[0].Key.LoopRegister;
+  }

   SmallVector<std::unique_ptr<const SnippetRepetitor>, 2> Repetitors;
   if (RepetitionMode != Benchmark::RepetitionModeE::AggregateMin)
-    Repetitors.emplace_back(SnippetRepetitor::Create(RepetitionMode, State));
+    Repetitors.emplace_back(
+        SnippetRepetitor::Create(RepetitionMode, State, LoopRegister));
   else {
     for (Benchmark::RepetitionModeE RepMode :
          {Benchmark::RepetitionModeE::Duplicate,
           Benchmark::RepetitionModeE::Loop})
-      Repetitors.emplace_back(SnippetRepetitor::Create(RepMode, State));
+      Repetitors.emplace_back(
+          SnippetRepetitor::Create(RepMode, State, LoopRegister));
   }

   BitVector AllReservedRegs;
   for (const std::unique_ptr<const SnippetRepetitor> &Repetitor : Repetitors)
     AllReservedRegs |= Repetitor->getReservedRegs();

-  std::vector<BenchmarkCode> Configurations;
   if (!Opcodes.empty()) {
     for (const unsigned Opcode : Opcodes) {
       // Ignore instructions without a sched class if
@@ -534,17 +554,6 @@ void benchmarkMain() {
       std::move(ConfigsForInstr->begin(), ConfigsForInstr->end(),
                 std::back_inserter(Configurations));
     }
-  } else {
-    Configurations = ExitOnErr(readSnippets(State, SnippetsFile));
-    for (const auto &Configuration : Configurations) {
-      if (ExecutionMode != BenchmarkRunner::ExecutionModeE::SubProcess &&
-          (Configuration.Key.MemoryMappings.size() != 0 ||
-           Configuration.Key.MemoryValues.size() != 0 ||
-           Configuration.Key.SnippetAddress != 0))
-        ExitWithError("Memory and snippet address annotations are only "
-                      "supported in subprocess "
-                      "execution mode");
-    }
   }

   if (MinInstructions == 0) {

diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
index 505a030675f64cc..f1fa891171177ca 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetFileTest.cpp
@@ -219,6 +219,25 @@ TEST_F(X86SnippetFileTest, SnippetAddress) {
   EXPECT_EQ(Snippet.Key.SnippetAddress, 0x10000);
 }

+TEST_F(X86SnippetFileTest, LoopRegister) {
+  auto Snippets = TestCommon(R"(
+    # LLVM-EXEGESIS-LOOP-REGISTER R11
+  )");
+  ASSERT_TRUE(static_cast<bool>(Snippets));
+  EXPECT_THAT(*Snippets, SizeIs(1));
+  const auto &Snippet = (*Snippets)[0];
+  EXPECT_EQ(Snippet.Key.LoopRegister, X86::R11);
+}
+
+TEST_F(X86SnippetFileTest, LoopRegisterInvalidRegister) {
+  auto Error = TestCommon(R"(
+    # LLVM-EXEGESIS-LOOP-REGISTER INVALID
+  )")
+                   .takeError();
+  EXPECT_TRUE(static_cast<bool>(Error));
+  consumeError(std::move(Error));
+}
+
 } // namespace
 } // namespace exegesis
 } // namespace llvm

diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
index 25e8836087c15df..b55ca5057ae01c7 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp
@@ -40,7 +40,10 @@ class X86SnippetRepetitorTest : public X86TestBase {

   void TestCommon(Benchmark::RepetitionModeE RepetitionMode,
                   unsigned SnippetInstructions = 1) {
-    const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State);
+    const auto Repetitor = SnippetRepetitor::Create(
+        RepetitionMode, State,
+        State.getExegesisTarget().getDefaultLoopCounterRegister(
+            State.getTargetMachine().getTargetTriple()));
     const std::vector<MCInst> Instructions(SnippetInstructions,
                                            MCInstBuilder(X86::NOOP));
     FunctionFiller Sink(*MF, {X86::EAX});
@@ -98,11 +101,12 @@ TEST_F(X86SnippetRepetitorTest, Loop) {
                           HasOpcode(X86::NOOP), HasOpcode(X86::NOOP),
                           HasOpcode(X86::NOOP), HasOpcode(X86::ADD64ri8),
                           HasOpcode(X86::JCC_1)));
-  EXPECT_THAT(LoopBlock.liveins(),
-              UnorderedElementsAre(
-                  LiveReg(X86::EAX),
-                  LiveReg(State.getExegesisTarget().getLoopCounterRegister(
-                      State.getTargetMachine().getTargetTriple()))));
+  EXPECT_THAT(
+      LoopBlock.liveins(),
+      UnorderedElementsAre(
+          LiveReg(X86::EAX),
+          LiveReg(State.getExegesisTarget().getDefaultLoopCounterRegister(
+              State.getTargetMachine().getTargetTriple()))));
   EXPECT_THAT(MF->getBlockNumbered(2)->instrs(),
               ElementsAre(HasOpcode(X86::RET64)));
 }

From b0bae445176d30a3fa577d30c21f36dad61003b8 Mon Sep 17 00:00:00 2001
From: rohit-rao
Date: Tue, 27 Feb 2024 15:29:34 -0500
Subject: [PATCH 002/114] [lld] Adds support for xros.
 (#83031)
---
 lld/MachO/Driver.cpp              | 19 +++++++++++++------
 lld/MachO/Options.td              |  2 +-
 lld/test/MachO/lc-build-version.s |  7 +++++++
 lld/test/MachO/platform-version.s |  2 +-
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index a57f60c5eed36b7..018ceec97f204af 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -691,6 +691,8 @@ static PlatformVersion parsePlatformVersion(const Arg *arg) {
       .Cases("tvos-simulator", "8", PLATFORM_TVOSSIMULATOR)
       .Cases("watchos-simulator", "9", PLATFORM_WATCHOSSIMULATOR)
       .Cases("driverkit", "10", PLATFORM_DRIVERKIT)
+      .Cases("xros", "11", PLATFORM_XROS)
+      .Cases("xros-simulator", "12", PLATFORM_XROS_SIMULATOR)
       .Default(PLATFORM_UNKNOWN);
   if (platformVersion.platform == PLATFORM_UNKNOWN)
     error(Twine("malformed platform: ") + platformStr);
@@ -985,6 +987,8 @@ PlatformType macho::removeSimulator(PlatformType platform) {
     return PLATFORM_TVOS;
   case PLATFORM_WATCHOSSIMULATOR:
     return PLATFORM_WATCHOS;
+  case PLATFORM_XROS_SIMULATOR:
+    return PLATFORM_XROS;
   default:
     return platform;
   }
@@ -1001,15 +1005,17 @@ static bool shouldAdhocSignByDefault(Architecture arch, PlatformType platform) {

   return platform == PLATFORM_MACOS || platform == PLATFORM_IOSSIMULATOR ||
          platform == PLATFORM_TVOSSIMULATOR ||
-         platform == PLATFORM_WATCHOSSIMULATOR;
+         platform == PLATFORM_WATCHOSSIMULATOR ||
+         platform == PLATFORM_XROS_SIMULATOR;
 }

 static bool dataConstDefault(const InputArgList &args) {
-  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
+  static const std::array<std::pair<PlatformType, VersionTuple>, 6> minVersion =
       {{{PLATFORM_MACOS, VersionTuple(10, 15)},
        {PLATFORM_IOS, VersionTuple(13, 0)},
        {PLATFORM_TVOS, VersionTuple(13, 0)},
        {PLATFORM_WATCHOS, VersionTuple(6, 0)},
+       {PLATFORM_XROS, VersionTuple(1, 0)},
        {PLATFORM_BRIDGEOS, VersionTuple(4, 0)}}};
   PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
   auto it = llvm::find_if(minVersion,
@@ -1045,11 +1051,12 @@ static bool shouldEmitChainedFixups(const InputArgList &args) {
   bool isRequested = arg != nullptr;

   // Version numbers taken from the Xcode 13.3 release notes.
-  static const std::array<std::pair<PlatformType, VersionTuple>, 4> minVersion =
+  static const std::array<std::pair<PlatformType, VersionTuple>, 5> minVersion =
      {{{PLATFORM_MACOS, VersionTuple(11, 0)},
        {PLATFORM_IOS, VersionTuple(13, 4)},
        {PLATFORM_TVOS, VersionTuple(14, 0)},
-       {PLATFORM_WATCHOS, VersionTuple(7, 0)}}};
+       {PLATFORM_WATCHOS, VersionTuple(7, 0)},
+       {PLATFORM_XROS, VersionTuple(1, 0)}}};
   PlatformType platform = removeSimulator(config->platformInfo.target.Platform);
   auto it = llvm::find_if(minVersion,
                           [&](const auto &p) { return p.first == platform; });
@@ -1688,8 +1695,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
   if (args.getLastArg(OPT_reproducible))
     config->zeroModTime = true;

-  std::array<PlatformType, 3> encryptablePlatforms{
-      PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS};
+  std::array<PlatformType, 4> encryptablePlatforms{
+      PLATFORM_IOS, PLATFORM_WATCHOS, PLATFORM_TVOS, PLATFORM_XROS};
   config->emitEncryptionInfo =
       args.hasFlag(OPT_encryptable, OPT_no_encryption,
                    is_contained(encryptablePlatforms, config->platform()));

diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 01e73b789f9aabe..a524e4a4c508412 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -377,7 +377,7 @@ def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">;

 def platform_version : MultiArg<["-"], "platform_version", 3>,
     MetaVarName<"<platform> <min_version> <sdk_version>">,
-    HelpText<"Platform (e.g., macos, ios, tvos, watchos, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, driverkit) and version numbers">,
+    HelpText<"Platform (e.g., macos, ios, tvos, watchos, xros, bridgeos, mac-catalyst, ios-sim, tvos-sim, watchos-sim, xros-sim, driverkit) and version numbers">,
     Group<grp_version>;
 def sdk_version : Separate<["-"], "sdk_version">,
     HelpText<"This option is undocumented in ld64">,

diff --git a/lld/test/MachO/lc-build-version.s b/lld/test/MachO/lc-build-version.s
index 7b78f803428a7f1..1fd7078919b1593 100644
--- a/lld/test/MachO/lc-build-version.s
+++ b/lld/test/MachO/lc-build-version.s
@@ -64,6 +64,13 @@

 # WATCHOS-4-0: cmd LC_VERSION_MIN_WATCHOS

+# RUN: %no-arg-lld -arch x86_64 -platform_version xros 1.0 1.1 -o %t.xros-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-1-0 | FileCheck %s --check-prefix=XROS-1-0
+# RUN: %no-arg-lld -arch x86_64 -platform_version xros-simulator 1.0 1.1 -o %t.xros-sim-1-0 %t.o
+# RUN: llvm-objdump --macho --all-headers %t.xros-sim-1-0 | FileCheck %s --check-prefix=XROS-1-0
+
+# XROS-1-0: cmd LC_BUILD_VERSION
+
 .text
 .global _main
 _main:

diff --git a/lld/test/MachO/platform-version.s b/lld/test/MachO/platform-version.s
index 047aea02fcde311..57fbae62b2ffcd2 100644
--- a/lld/test/MachO/platform-version.s
+++ b/lld/test/MachO/platform-version.s
@@ -55,7 +55,7 @@
 # RUN:     -platform_version 0 1 5 \
 # RUN:   | FileCheck --check-prefix=FAIL-PLATFORM %s
 # RUN: not %no-arg-lld -arch x86_64 -o %t %t.o 2>&1 \
-# RUN:     -platform_version 11 1 5 \
+# RUN:     -platform_version 13 1 5 \
 # RUN:   | FileCheck --check-prefix=FAIL-PLATFORM %s
 # FAIL-PLATFORM: malformed platform: {{.*}}
 # FAIL-PLATFORM-NOT: malformed {{minimum|sdk}} version: {{.*}}

From 3250330997cf214293a20a1d532b617d72bafb09 Mon Sep 17 00:00:00 2001
From: Wu Yingcong
Date: Tue, 27 Feb 2024 12:52:04 -0800
Subject: [PATCH 003/114] [asan] Disable instrumentation for
 available_externally global with COFF (#81109)

For COFF, an available_externally global will be instrumented because of
the lack of filtering, which will trigger a Verifier pass assertion and
crash the compilation. This patch filters out available_externally
globals for COFF.
For non-COFF, `!G->hasExactDefinition()` in line 1954 will filter out the
available_externally globals.

There is a related bug reported in
https://bugs.llvm.org/show_bug.cgi?id=47950 /
https://github.com/llvm/llvm-project/issues/47294. I tried the reproducer
posted on the page and this patch fixes the problem.

Reproducer:
```
#include <locale>

void grouping_impl() { std::use_facet<std::numpunct<char>>(std::locale()); }

// clang -fsanitize=address -D_DLL -std=c++14 -c format.cc
```
---
 .../Transforms/Instrumentation/AddressSanitizer.cpp  |  4 ++++
 .../do-not-instrument-globals-windows.ll             | 10 ++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll

diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 8a2864a0787312a..5d5c4ea57ed56cb 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1957,6 +1957,10 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
     // On COFF, don't instrument non-ODR linkages.
     if (G->isInterposable())
       return false;
+    // If the global has AvailableExternally linkage, then it is not in this
+    // module, which means it does not need to be instrumented.
+    if (G->hasAvailableExternallyLinkage())
+      return false;
   }

   // If a comdat is present, it must have a selection kind that implies ODR
diff --git a/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
new file mode 100644
index 000000000000000..c143f69f126a85b
--- /dev/null
+++ b/llvm/test/Instrumentation/AddressSanitizer/do-not-instrument-globals-windows.ll
@@ -0,0 +1,10 @@
+; This test checks that we are not instrumenting unnecessary globals
+; RUN: opt < %s -passes=asan -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+@v_available_externally = available_externally global i32 zeroinitializer
+; CHECK-NOT: {{asan_gen.*v_available_externally}}
+
+; CHECK: @asan.module_ctor

From 5e31e82698d9f1d3f1dd881c87b8c5399d790772 Mon Sep 17 00:00:00 2001
From: Alexander Richardson
Date: Tue, 27 Feb 2024 13:00:07 -0800
Subject: [PATCH 004/114] [compiler-rt] Use locally configured llvm-lit for
 standalone builds (#83178)

When building compiler-rt standalone with
`-DLLVM_CMAKE_DIR=$HOME/output/llvm-install -DCOMPILER_RT_INCLUDE_TESTS=ON`,
the current code will attempt to use `LLVM_DEFAULT_EXTERNAL_LIT`, which is
set to `$HOME/output/llvm-install/bin/llvm-lit` inside `LLVMConfig.cmake`
even though it is not actually installed. If we are adding the llvm-lit
subdirectory, we can use `get_llvm_lit_path()` immediately afterwards to
set LLVM_EXTERNAL_LIT, so that subsequent calls within
`add_lit_testsuite()` use llvm-lit from the current build directory
instead of the nonexistent one.
---
 compiler-rt/CMakeLists.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index bbb4e8d7c333e4f..8a2b138d8d7020d 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -771,8 +771,6 @@ mark_as_advanced(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER)
 add_subdirectory(lib)

 if(COMPILER_RT_INCLUDE_TESTS)
-  add_subdirectory(unittests)
-  add_subdirectory(test)
   # Don't build llvm-lit for runtimes-build, it will clean up map_config.
   if (COMPILER_RT_STANDALONE_BUILD AND NOT LLVM_RUNTIMES_BUILD)
     # If we have a valid source tree, generate llvm-lit into the bin directory.
@@ -782,11 +780,17 @@ if(COMPILER_RT_INCLUDE_TESTS)
       # Needed for lit support in standalone builds.
       include(AddLLVM)
       add_subdirectory(${LLVM_MAIN_SRC_DIR}/utils/llvm-lit ${CMAKE_CURRENT_BINARY_DIR}/llvm-lit)
+      # Ensure that the testsuite uses the local lit rather than
+      # LLVM_INSTALL_DIR/bin/llvm-lit (which probably does not exist).
+      get_llvm_lit_path(_base_dir _file_name)
+      set(LLVM_EXTERNAL_LIT "${_base_dir}/${_file_name}" CACHE STRING "Command used to spawn lit" FORCE)
     elseif(NOT EXISTS ${LLVM_EXTERNAL_LIT})
       message(WARNING
         "Could not find LLVM source directory and LLVM_EXTERNAL_LIT does not"
        "point to a valid file. You will not be able to run tests.")
     endif()
   endif()
+  add_subdirectory(unittests)
+  add_subdirectory(test)
 endif()

 add_subdirectory(tools)

From c6fa71cdd0b85a8edb612e326ea275eb23aab032 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 27 Feb 2024 15:14:01 -0600
Subject: [PATCH 005/114] [libc][NFC] Add `-Wno-multi-gpu` everywhere for the
 GPU build (#83173)

Summary:
This warning is intended to indicate when `-march=native` resolves to
different values within a single compilation. As it stands we don't care,
and it absolutely spams the test output if you run it on a machine with
more than one GPU, such as any cluster machine. Disable these warnings
everywhere we compile.
---
 libc/cmake/modules/LLVMLibCCompileOptionRules.cmake | 1 +
 libc/cmake/modules/LLVMLibCTestRules.cmake          | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
index c7ccd392354cb8f..72b04822d8b84a6 100644
--- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
+++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
@@ -87,6 +87,7 @@ function(_get_common_compile_options output_var flags)
     list(APPEND compile_options "-fvisibility=hidden")
     list(APPEND compile_options "-fconvergent-functions")
     list(APPEND compile_options "-flto")
+    list(APPEND compile_options "-Wno-multi-gpu")

     if(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       list(APPEND compile_options "-Wno-unknown-cuda-version")
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 76ce6754bd73393..836e15d34741b2a 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -463,7 +463,7 @@ function(add_integration_test test_name)
     if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
       target_link_options(${fq_build_target_name} PRIVATE
-        ${LIBC_COMPILE_OPTIONS_DEFAULT}
+        ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
        -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
        "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
        "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
     elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       # We need to use the internal object versions for NVPTX.
       set(internal_suffix ".__internal__")
       target_link_options(${fq_build_target_name} PRIVATE
-        ${LIBC_COMPILE_OPTIONS_DEFAULT}
+        ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
        "-Wl,--suppress-stack-size-warning"
        -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
        "--cuda-path=${LIBC_CUDA_ROOT}")
@@ -647,14 +647,14 @@ function(add_libc_hermetic_test test_name)
     if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
       target_link_options(${fq_build_target_name} PRIVATE
         ${LIBC_COMPILE_OPTIONS_DEFAULT}
-        -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
+        -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu
        "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
        "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
     elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
       # We need to use the internal object versions for NVPTX.
       set(internal_suffix ".__internal__")
       target_link_options(${fq_build_target_name} PRIVATE
-        ${LIBC_COMPILE_OPTIONS_DEFAULT}
+        ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
        "-Wl,--suppress-stack-size-warning"
        -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static
        "--cuda-path=${LIBC_CUDA_ROOT}")

From f7a99664681390f4bb0211a52f0d89c70aa96762 Mon Sep 17 00:00:00 2001
From: Janeczko Jakub
Date: Tue, 27 Feb 2024 22:17:11 +0100
Subject: [PATCH 006/114] Fix typo in mlir::Value doxygen comment (#83150)

Fix #82900
---
 mlir/include/mlir/IR/Value.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/IR/Value.h b/mlir/include/mlir/IR/Value.h
index fff3b87faff669a..a74d0faa1dfc4b2 100644
--- a/mlir/include/mlir/IR/Value.h
+++ b/mlir/include/mlir/IR/Value.h
@@ -90,7 +90,7 @@ class alignas(8) ValueImpl : public IRObjectWithUseList {
 /// class has value-type semantics and is just a simple wrapper around a
 /// ValueImpl that is either owner by a block(in the case of a BlockArgument) or
 /// an Operation(in the case of an OpResult).
-/// As most IR construct, this isn't const-correct, but we keep method
+/// As most IR constructs, this isn't const-correct, but we keep method
 /// consistent and as such method that immediately modify this Value aren't
 /// marked `const` (include modifying the Value use-list).
 class Value {

From d82e93e7f129d9e8b72570efdf4a15d6ec3d4336 Mon Sep 17 00:00:00 2001
From: Peiming Liu <36770114+PeimingLiu@users.noreply.github.com>
Date: Tue, 27 Feb 2024 13:18:43 -0800
Subject: [PATCH 007/114] [mlir][sparse] add merger support on Batch LevelType.
 (#83186)

---
 .../mlir/Dialect/SparseTensor/IR/Enums.h      | 18 +++++-
 .../mlir/Dialect/SparseTensor/Utils/Merger.h  |  3 +-
 .../lib/Dialect/SparseTensor/Utils/Merger.cpp |  8 +--
 .../Dialect/SparseTensor/MergerTest.cpp       | 58 +++++++++++++------
 4 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
index 9e79b6aca1c9ba0..5563cb907e9353e 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -333,16 +333,28 @@ struct LevelType {
     return lvlBits & static_cast<uint64_t>(p);
   }

+  /// Check if the `LevelType` is considered to be sparse.
+  constexpr bool hasSparseSemantic() const {
+    return isa<LevelFormat::Compressed, LevelFormat::LooseCompressed,
+               LevelFormat::Singleton, LevelFormat::NOutOfM>();
+  }
+
+  /// Check if the `LevelType` is considered to be dense-like.
+  constexpr bool hasDenseSemantic() const {
+    return isa<LevelFormat::Dense, LevelFormat::Batch>();
+  }
+
   /// Check if the `LevelType` needs positions array.
   constexpr bool isWithPosLT() const {
-    return isa<LevelFormat::Compressed>() ||
-           isa<LevelFormat::LooseCompressed>();
+    assert(!isa<LevelFormat::Undef>());
+    return isa<LevelFormat::Compressed, LevelFormat::LooseCompressed>();
   }

   /// Check if the `LevelType` needs coordinates array.
   constexpr bool isWithCrdLT() const {
+    assert(!isa<LevelFormat::Undef>());
     // All sparse levels has coordinate array.
-    return !isa<LevelFormat::Dense>();
+    return hasSparseSemantic();
   }

   std::string toMLIRString() const {
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
index 490ef3071af1b79..7f9820df984b293 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Utils/Merger.h
@@ -509,8 +509,7 @@ class Merger {
   bool isSparseLvlWithNonTrivialIdxExp(TensorLoopId b) const {
     if (isLvlWithNonTrivialIdxExp(b)) {
       auto lt = getLoopDependentLevelType(b);
-      return isCompressedLT(lt) || isSingletonLT(lt) ||
-             isLooseCompressedLT(lt) || isNOutOfMLT(lt);
+      return lt.hasSparseSemantic();
     }
     return false;
   }
diff --git a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
index 731cd79a1e3b4bc..72b722c69ae34bb 100644
--- a/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Utils/Merger.cpp
@@ -476,7 +476,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
   // Starts resetting from a dense level, so that the first bit (if kept)
   // is not undefined level-type.
   for (unsigned b = 0; b < be; b++) {
-    if (simple[b] && isDenseLT(getLvlType(TensorLoopId{b}))) {
+    if (simple[b] && getLvlType(TensorLoopId{b}).hasDenseSemantic()) {
      offset = be - b - 1; // relative to the end
      break;
    }
@@ -489,8 +489,7 @@ BitVector Merger::simplifyCond(LatSetId s0, LatPointId p0) {
     // Slice on dense level has `locate` property as well, and can be optimized.
     if (simple[b] && !isSparseLvlWithNonTrivialIdxExp(b)) {
       const auto lt = getLvlType(b);
-      if (!isCompressedLT(lt) && !isSingletonLT(lt) &&
-          !isLooseCompressedLT(lt) && !isNOutOfMLT(lt)) {
+      if (!lt.hasSparseSemantic()) {
        if (reset)
          simple.reset(b);
        reset = true;
@@ -670,8 +669,7 @@ bool Merger::isSingleCondition(TensorId t, ExprId e) const {
 bool Merger::hasAnySparse(const BitVector &bits) const {
   for (TensorLoopId b : bits.set_bits()) {
     const auto lt = getLvlType(b);
-    if (isCompressedLT(lt) || isSingletonLT(lt) || isLooseCompressedLT(lt) ||
-        isNOutOfMLT(lt))
+    if (lt.hasSparseSemantic())
       return true;
   }
   return hasSparseIdxReduction(bits);
diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index 62a19c084cac0fa..943e7d5c120b87d 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -120,7 +120,8 @@ static Match synZeroMatch() { return Match(); }
 FOREVERY_BINOP(IMPL_BINOP_PATTERN)
 #undef IMPL_BINOP_PATTERN

-class MergerTestBase : public ::testing::Test {
+// Parameterize LevelFormat to test both Dense and Batch LevelFormat.
+class MergerTestBase : public ::testing::TestWithParam<LevelFormat> {
 protected:
   MergerTestBase(unsigned numTensors, unsigned numLoops)
       : merger(numTensors, numLoops, /*maxRank=*/numLoops) {
@@ -317,10 +318,14 @@ class MergerTest3T1L : public MergerTestBase {
     // Tensor 1: sparse input vector.
     merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Compressed);
     // Tensor 2: dense output vector.
-    merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense);
+    merger.setLevelAndType(tid(2), lid(0), 0, GetParam());
   }
 };

+INSTANTIATE_TEST_SUITE_P(Test3T1L, MergerTest3T1L,
+                         ::testing::Values(LevelFormat::Dense,
+                                           LevelFormat::Batch));
+
 /// Four tensors (three inputs, one output); and a single loop.
class MergerTest4T1L : public MergerTestBase { protected: @@ -333,10 +338,14 @@ class MergerTest4T1L : public MergerTestBase { // Tensor 2: sparse input vector merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Compressed); // Tensor 3: dense output vector - merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(3), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test4T1L, MergerTest4T1L, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with both sparse and dense input. /// @@ -349,12 +358,16 @@ class MergerTest3T1LD : public MergerTestBase { // Tensor 0: sparse input vector. merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Compressed); // Tensor 1: dense input vector. - merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(1), lid(0), 0, GetParam()); // Tensor 2: dense output vector. - merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(2), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test3T1LD, MergerTest3T1LD, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with both undef and dense input. /// @@ -367,14 +380,18 @@ class MergerTest4T1LU : public MergerTestBase { // Tensor 0: undef input vector. merger.setLevelAndType(tid(0), lid(0), 0, LevelFormat::Undef); // Tensor 1: dense input vector. - merger.setLevelAndType(tid(1), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(1), lid(0), 0, GetParam()); // Tensor 2: undef input vector. merger.setLevelAndType(tid(2), lid(0), 0, LevelFormat::Undef); // Tensor 3: dense output vector. - merger.setLevelAndType(tid(3), lid(0), 0, LevelFormat::Dense); + merger.setLevelAndType(tid(3), lid(0), 0, GetParam()); } }; +INSTANTIATE_TEST_SUITE_P(Test4T1LU, MergerTest4T1LU, + ::testing::Values(LevelFormat::Dense, + LevelFormat::Batch)); + /// /// Tests with operation on sparse output. /// @@ -395,6 +412,11 @@ class MergerTest3T1LSo : public MergerTestBase { } }; +// This testsuite does not use any dense-like format, just one of {Dense, Batch} +// is enough. 
+INSTANTIATE_TEST_SUITE_P(Test3T1LSo, MergerTest3T1LSo, + ::testing::Values(LevelFormat::Dense)); + } // namespace /// Vector multiplication (conjunction) of 3 vectors, i.e.; @@ -409,7 +431,7 @@ class MergerTest3T1LSo : public MergerTestBase { /// lat( i_01_D / (tensor_0 * tensor_1 * tensor2) ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ_UNDEF(CONJ1, CONJ2) \ - TEST_F(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest4T1LU, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -443,7 +465,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF) /// lat( i_03_U / (tensor_0 * tensor_1 * output_tensor2) ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT(CONJ1, CONJ2) \ - TEST_F(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest3T1LSo, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -482,7 +504,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT) /// lat( i_01 / tensor_1 ) /// } #define IMPL_MERGER_TEST_DISJ(OP, UNUSED) \ - TEST_F(MergerTest3T1L, vector_##OP) { \ + TEST_P(MergerTest3T1L, vector_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -514,7 +536,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ) /// lat( i_00 i_01 / (tensor_0 * tensor_1) ) /// } #define IMPL_MERGER_TEST_CONJ(OP, UNUSED) \ - TEST_F(MergerTest3T1L, vector_##OP) { \ + TEST_P(MergerTest3T1L, vector_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -544,7 +566,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ) /// lat( i_02 / tensor_2 ) /// } #define IMPL_MERGER_TEST_CONJ_DISJ(CONJ, DISJ) \ - TEST_F(MergerTest4T1L, vector_##CONJ##_##DISJ) { \ + TEST_P(MergerTest4T1L, vector_##CONJ##_##DISJ) { \ const auto em = CONJ##Expr(tensor(0), tensor(1)); \ const auto e = DISJ##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -587,7 +609,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ) /// lat( i_00 / tensor_0 ) /// } #define IMPL_MERGER_TEST_DISJ_DISJ(DISJ1, DISJ2) \ - TEST_F(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \ + TEST_P(MergerTest4T1L, Vector_##DISJ1##_##DISJ2) { \ const auto em = DISJ1##Expr(tensor(0), tensor(1)); \ const auto e = DISJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -636,7 +658,7 @@ FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ) /// lat( i_00 i_01 i_02 / tensor_0 * tensor_1 * tensor_2 ) /// } #define IMPL_MERGER_TEST_CONJ_CONJ(CONJ1, CONJ2) \ - TEST_F(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \ + TEST_P(MergerTest4T1L, vector_##CONJ1##_##CONJ2) { \ const auto em = CONJ1##Expr(tensor(0), tensor(1)); \ const auto e = CONJ2##Expr(em, tensor(2)); \ const auto l0 = lid(0); \ @@ -675,7 +697,7 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ) /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff /// with lat( i_00 i_01 / (sparse_tensor_0 + dense_tensor_1) ). 
#define IMPL_MERGER_TEST_OPTIMIZED_DISJ(OP, UNUSED) \ - TEST_F(MergerTest3T1LD, vector_opted_##OP) { \ + TEST_P(MergerTest3T1LD, vector_opted_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -711,7 +733,7 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ) /// } /// since i_01 is a dense dimension. #define IMPL_MERGER_TEST_OPTIMIZED_CONJ(OP, UNUSED) \ - TEST_F(MergerTest3T1LD, vector_opted_##OP) { \ + TEST_P(MergerTest3T1LD, vector_opted_##OP) { \ const auto e = OP##Expr(tensor(0), tensor(1)); \ const auto l0 = lid(0); \ const auto t0 = tid(0); \ @@ -746,7 +768,7 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ) /// lat( i_00 / tensor_0 cmp 0 ) /// lat( i_01 / 0 cmp tensor_1 ) /// } -TEST_F(MergerTest3T1L, vector_cmp) { +TEST_P(MergerTest3T1L, vector_cmp) { const auto e = cmpiExpr(tensor(0), tensor(1)); const auto l0 = lid(0); const auto t0 = tid(0); @@ -784,7 +806,7 @@ TEST_F(MergerTest3T1L, vector_cmp) { /// /// lat( i_00 / sparse_tensor_0 ) should be opted out as it only has dense diff /// with lat( i_00 i_01 / (sparse_tensor_0 cmp dense_tensor_1) ). -TEST_F(MergerTest3T1LD, vector_cmp) { +TEST_P(MergerTest3T1LD, vector_cmp) { const auto e = cmpiExpr(tensor(0), tensor(1)); const auto l0 = lid(0); const auto t0 = tid(0); From 06bcd9da1670b1d62e08b9fdd58b3a64368da87b Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 27 Feb 2024 13:45:37 -0800 Subject: [PATCH 008/114] [libc][stdbit] implement stdc_has_single_bit (C23) (#83168) --- libc/config/linux/x86_64/entrypoints.txt | 5 +++++ libc/docs/stdbit.rst | 12 +++++----- libc/include/llvm-libc-macros/stdbit-macros.h | 22 +++++++++++++++++++ libc/spec/spec.td | 1 + libc/spec/stdc.td | 10 +++++++-- libc/src/stdbit/CMakeLists.txt | 1 + libc/src/stdbit/stdc_has_single_bit_uc.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_uc.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ui.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ui.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ul.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ul.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ull.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_ull.h | 18 +++++++++++++++ libc/src/stdbit/stdc_has_single_bit_us.cpp | 20 +++++++++++++++++ libc/src/stdbit/stdc_has_single_bit_us.h | 18 +++++++++++++++ libc/test/include/stdbit_test.cpp | 13 +++++++++++ libc/test/src/stdbit/CMakeLists.txt | 1 + .../stdbit/stdc_has_single_bit_uc_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ui_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ul_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_ull_test.cpp | 20 +++++++++++++++++ .../stdbit/stdc_has_single_bit_us_test.cpp | 20 +++++++++++++++++ 23 files changed, 347 insertions(+), 8 deletions(-) create mode 100644 libc/src/stdbit/stdc_has_single_bit_uc.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_uc.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ui.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ui.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ul.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ul.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_ull.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_ull.h create mode 100644 libc/src/stdbit/stdc_has_single_bit_us.cpp create mode 100644 libc/src/stdbit/stdc_has_single_bit_us.h create mode 100644 
libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp create mode 100644 libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index c2300a2aa681a44..27c9a42934c240f 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -142,6 +142,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdbit.stdc_count_ones_ui libc.src.stdbit.stdc_count_ones_ul libc.src.stdbit.stdc_count_ones_ull + libc.src.stdbit.stdc_has_single_bit_uc + libc.src.stdbit.stdc_has_single_bit_us + libc.src.stdbit.stdc_has_single_bit_ui + libc.src.stdbit.stdc_has_single_bit_ul + libc.src.stdbit.stdc_has_single_bit_ull # stdlib.h entrypoints libc.src.stdlib.abs diff --git a/libc/docs/stdbit.rst b/libc/docs/stdbit.rst index 0308caeb9293212..b579e9dbbc2f512 100644 --- a/libc/docs/stdbit.rst +++ b/libc/docs/stdbit.rst @@ -81,11 +81,11 @@ stdc_count_ones_us |check| stdc_count_ones_ui |check| stdc_count_ones_ul |check| stdc_count_ones_ull |check| -stdc_has_single_bit_uc -stdc_has_single_bit_us -stdc_has_single_bit_ui -stdc_has_single_bit_ul -stdc_has_single_bit_ull +stdc_has_single_bit_uc |check| +stdc_has_single_bit_us |check| +stdc_has_single_bit_ui |check| +stdc_has_single_bit_ul |check| +stdc_has_single_bit_ull |check| stdc_bit_width_uc stdc_bit_width_us stdc_bit_width_ui @@ -124,7 +124,7 @@ stdc_first_trailing_zero |check| stdc_first_trailing_one |check| stdc_count_zeros |check| stdc_count_ones |check| -stdc_has_single_bit +stdc_has_single_bit |check| stdc_bit_width stdc_bit_floor stdc_bit_ceil diff --git a/libc/include/llvm-libc-macros/stdbit-macros.h b/libc/include/llvm-libc-macros/stdbit-macros.h index 5ee152e105f7727..e3a36d10ed92aba 100644 --- a/libc/include/llvm-libc-macros/stdbit-macros.h +++ b/libc/include/llvm-libc-macros/stdbit-macros.h @@ -157,6 +157,21 @@ inline unsigned stdc_count_ones(unsigned long x) { inline unsigned stdc_count_ones(unsigned long long x) { return stdc_count_ones_ull(x); } +inline bool stdc_has_single_bit(unsigned char x) { + return stdc_has_single_bit_uc(x); +} +inline bool stdc_has_single_bit(unsigned short x) { + return stdc_has_single_bit_us(x); +} +inline bool stdc_has_single_bit(unsigned x) { + return stdc_has_single_bit_ui(x); +} +inline bool stdc_has_single_bit(unsigned long x) { + return stdc_has_single_bit_ul(x); +} +inline bool stdc_has_single_bit(unsigned long long x) { + return stdc_has_single_bit_ull(x); +} #else #define stdc_leading_zeros(x) \ _Generic((x), \ @@ -228,6 +243,13 @@ inline unsigned stdc_count_ones(unsigned long long x) { unsigned: stdc_count_ones_ui, \ unsigned long: stdc_count_ones_ul, \ unsigned long long: stdc_count_ones_ull)(x) +#define stdc_has_single_bit(x) \ + _Generic((x), \ + unsigned char: stdc_has_single_bit_uc, \ + unsigned short: stdc_has_single_bit_us, \ + unsigned: stdc_has_single_bit_ui, \ + unsigned long: stdc_has_single_bit_ul, \ + unsigned long long: stdc_has_single_bit_ull)(x) #endif // __cplusplus #endif // __LLVM_LIBC_MACROS_STDBIT_MACROS_H diff --git a/libc/spec/spec.td b/libc/spec/spec.td index 90c076580be1252..998f37fb26deed4 100644 --- a/libc/spec/spec.td +++ b/libc/spec/spec.td @@ -51,6 +51,7 @@ def LongDoubleType : NamedType<"long double">; def CharType : NamedType<"char">; def UnsignedCharType 
: NamedType<"unsigned char">;
 def UnsignedShortType : NamedType<"unsigned short">;
+def BoolType : NamedType<"bool">;

 def Float128Type : NamedType<"float128">;
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 8a1a235e4eecdc5..5b97255b899746a 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -799,7 +799,8 @@ def StdC : StandardSpec<"stdc"> {
           Macro<"stdc_first_trailing_zero">,
           Macro<"stdc_first_trailing_one">,
           Macro<"stdc_count_zeros">,
-          Macro<"stdc_count_ones">
+          Macro<"stdc_count_ones">,
+          Macro<"stdc_has_single_bit">
       ], // Macros
       [], // Types
       [], // Enumerations
@@ -848,7 +849,12 @@ def StdC : StandardSpec<"stdc"> {
          FunctionSpec<"stdc_count_ones_us", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedShortType>]>,
          FunctionSpec<"stdc_count_ones_ui", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedIntType>]>,
          FunctionSpec<"stdc_count_ones_ul", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongType>]>,
-         FunctionSpec<"stdc_count_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>
+         FunctionSpec<"stdc_count_ones_ull", RetValSpec<UnsignedIntType>, [ArgSpec<UnsignedLongLongType>]>,
+         FunctionSpec<"stdc_has_single_bit_uc", RetValSpec<BoolType>, [ArgSpec<UnsignedCharType>]>,
+         FunctionSpec<"stdc_has_single_bit_us", RetValSpec<BoolType>, [ArgSpec<UnsignedShortType>]>,
+         FunctionSpec<"stdc_has_single_bit_ui", RetValSpec<BoolType>, [ArgSpec<UnsignedIntType>]>,
+         FunctionSpec<"stdc_has_single_bit_ul", RetValSpec<BoolType>, [ArgSpec<UnsignedLongType>]>,
+         FunctionSpec<"stdc_has_single_bit_ull", RetValSpec<BoolType>, [ArgSpec<UnsignedLongLongType>]>
      ] // Functions
  >;
diff --git a/libc/src/stdbit/CMakeLists.txt b/libc/src/stdbit/CMakeLists.txt
index 5fb77d21e57a138..8bc7dd7852bbca0 100644
--- a/libc/src/stdbit/CMakeLists.txt
+++ b/libc/src/stdbit/CMakeLists.txt
@@ -9,6 +9,7 @@ set(prefixes
   first_trailing_one
   count_zeros
   count_ones
+  has_single_bit
 )
 set(suffixes c s i l ll)
 foreach(prefix IN LISTS prefixes)
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.cpp b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
new file mode 100644
index 000000000000000..e5acdc2a71b4bc9
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.cpp
@@ -0,0 +1,20 @@
+//===-- Implementation of stdc_has_single_bit_uc --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_uc, (unsigned char value)) {
+  return cpp::has_single_bit(value);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdbit/stdc_has_single_bit_uc.h b/libc/src/stdbit/stdc_has_single_bit_uc.h
new file mode 100644
index 000000000000000..028d4ee710505aa
--- /dev/null
+++ b/libc/src/stdbit/stdc_has_single_bit_uc.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for stdc_has_single_bit_uc --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_uc(unsigned char value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UC_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.cpp b/libc/src/stdbit/stdc_has_single_bit_ui.cpp new file mode 100644 index 000000000000000..37578882324aa6c --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ui.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ui --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ui.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ui, (unsigned value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ui.h b/libc/src/stdbit/stdc_has_single_bit_ui.h new file mode 100644 index 000000000000000..1e8cd9afaee8858 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ui.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ui --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ui(unsigned value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UI_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.cpp b/libc/src/stdbit/stdc_has_single_bit_ul.cpp new file mode 100644 index 000000000000000..85133ab81cc602c --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ul.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ul --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ul.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ul, (unsigned long value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ul.h b/libc/src/stdbit/stdc_has_single_bit_ul.h new file mode 100644 index 000000000000000..9b924fca9f065d5 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ul.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ul --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ul(unsigned long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_UL_H diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.cpp b/libc/src/stdbit/stdc_has_single_bit_ull.cpp new file mode 100644 index 000000000000000..4491cf2b98b6d32 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ull.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_ull -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_ull.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_ull, (unsigned long long value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_ull.h b/libc/src/stdbit/stdc_has_single_bit_ull.h new file mode 100644 index 000000000000000..d4802bc287274f7 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_ull.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_ull -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_ull(unsigned long long value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_ULL_H diff --git a/libc/src/stdbit/stdc_has_single_bit_us.cpp b/libc/src/stdbit/stdc_has_single_bit_us.cpp new file mode 100644 index 000000000000000..7a42ae553aa2e80 --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_us.cpp @@ -0,0 +1,20 @@ +//===-- Implementation of stdc_has_single_bit_us --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdbit/stdc_has_single_bit_us.h" + +#include "src/__support/CPP/bit.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(bool, stdc_has_single_bit_us, (unsigned short value)) { + return cpp::has_single_bit(value); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdbit/stdc_has_single_bit_us.h b/libc/src/stdbit/stdc_has_single_bit_us.h new file mode 100644 index 000000000000000..201ff4954c3b7ac --- /dev/null +++ b/libc/src/stdbit/stdc_has_single_bit_us.h @@ -0,0 +1,18 @@ +//===-- Implementation header for stdc_has_single_bit_us --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H +#define LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H + +namespace LIBC_NAMESPACE { + +bool stdc_has_single_bit_us(unsigned short value); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDBIT_STDC_HAS_SINGLE_BIT_US_H diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp index 46019075a7c1095..acb79ca0f3ff11d 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -81,6 +81,11 @@ unsigned stdc_count_ones_us(unsigned short) noexcept { return 0x3BU; } unsigned stdc_count_ones_ui(unsigned) noexcept { return 0x3CU; } unsigned stdc_count_ones_ul(unsigned long) noexcept { return 0x3DU; } unsigned stdc_count_ones_ull(unsigned long long) noexcept { return 0x3FU; } +bool stdc_has_single_bit_uc(unsigned char) noexcept { return false; } +bool stdc_has_single_bit_us(unsigned short) noexcept { return false; } +bool stdc_has_single_bit_ui(unsigned) noexcept { return false; } +bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; } +bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; } } #include "include/llvm-libc-macros/stdbit-macros.h" @@ -164,3 +169,11 @@ TEST(LlvmLibcStdbitTest, TypeGenericMacroCountOnes) { EXPECT_EQ(stdc_count_ones(0UL), 0x3DU); EXPECT_EQ(stdc_count_ones(0ULL), 0x3FU); } + +TEST(LlvmLibcStdbitTest, TypeGenericMacroHasSingleBit) { + EXPECT_EQ(stdc_has_single_bit(static_cast(1U)), false); + EXPECT_EQ(stdc_has_single_bit(static_cast(1U)), false); + EXPECT_EQ(stdc_has_single_bit(1U), false); + EXPECT_EQ(stdc_has_single_bit(1UL), false); + EXPECT_EQ(stdc_has_single_bit(1ULL), false); +} diff --git a/libc/test/src/stdbit/CMakeLists.txt b/libc/test/src/stdbit/CMakeLists.txt index 659e575fedea27b..a886ee4a35325d2 100644 --- a/libc/test/src/stdbit/CMakeLists.txt +++ b/libc/test/src/stdbit/CMakeLists.txt @@ -11,6 +11,7 @@ set(prefixes first_trailing_one count_zeros count_ones + has_single_bit ) set(suffixes c s i l ll) foreach(prefix IN LISTS prefixes) diff --git a/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp new file mode 100644 index 000000000000000..6212b1ec765a5df --- /dev/null +++ b/libc/test/src/stdbit/stdc_has_single_bit_uc_test.cpp @@ -0,0 +1,20 @@ +//===-- Unittests for stdc_has_single_bit_uc ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_uc.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUcTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUcTest, OneHot) {
+  for (unsigned i = 0U; i != UCHAR_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_uc(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
new file mode 100644
index 000000000000000..2e00507aa0258cd
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ui_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ui ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ui.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUiTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUiTest, OneHot) {
+  for (unsigned i = 0U; i != UINT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ui(1U << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
new file mode 100644
index 000000000000000..8c0178998bbec16
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ul_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ul ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ul.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUlTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUlTest, OneHot) {
+  for (unsigned i = 0U; i != ULONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ul(1UL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
new file mode 100644
index 000000000000000..1d9f976b6d63388
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_ull_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_ull -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_ull.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUllTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUllTest, OneHot) {
+  for (unsigned i = 0U; i != ULLONG_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_ull(1ULL << i), true);
+}
diff --git a/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
new file mode 100644
index 000000000000000..52c4de881044591
--- /dev/null
+++ b/libc/test/src/stdbit/stdc_has_single_bit_us_test.cpp
@@ -0,0 +1,20 @@
+//===-- Unittests for stdc_has_single_bit_us ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/stdbit/stdc_has_single_bit_us.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcStdcHasSingleBitUsTest, Zero) {
+  EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(0U), false);
+}
+
+TEST(LlvmLibcStdcHasSingleBitUsTest, OneHot) {
+  for (unsigned i = 0U; i != USHRT_WIDTH; ++i)
+    EXPECT_EQ(LIBC_NAMESPACE::stdc_has_single_bit_us(1U << i), true);
+}
From 70a7b1e8df7222557cadec4e6d007850ce64f8ed Mon Sep 17 00:00:00 2001
From: Xiang Li
Date: Tue, 27 Feb 2024 14:00:01 -0800
Subject: [PATCH 009/114] Remove test since no test on --debug output. (#83189)

---
 .../LLVMIR/erase-dangling-constants.mlir      | 73 ------------------
 1 file changed, 73 deletions(-)
 delete mode 100644 mlir/test/Target/LLVMIR/erase-dangling-constants.mlir

diff --git a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir b/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir
deleted file mode 100644
index dbb675595600316..000000000000000
--- a/mlir/test/Target/LLVMIR/erase-dangling-constants.mlir
+++ /dev/null
@@ -1,73 +0,0 @@
-// REQUIRES: asserts
-// RUN: mlir-translate -mlir-to-llvmir %s -debug-only=llvm-dialect-to-llvm-ir 2>&1 | FileCheck %s
-
-// CHECK: Convert initializer for dup_const
-// CHECK: 6 new constants hit
-// CHECK: 3 dangling constants erased
-// CHECK: Convert initializer for unique_const
-// CHECK: 6 new constants hit
-// CHECK: 5 dangling constants erased
-
-
-// CHECK:@dup_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.612250e-02, double 5.119230e-02] }
-
-llvm.mlir.global @dup_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> {
-  %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64
-  %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64
-
-  %empty0 = llvm.mlir.undef : !llvm.array<2 x f64>
-  %a00 = llvm.insertvalue %c0, %empty0[0] : !llvm.array<2 x f64>
-
-  %empty1 = llvm.mlir.undef : !llvm.array<2 x f64>
-  %a10 = llvm.insertvalue %c0, %empty1[0] : !llvm.array<2 x f64>
-
-  %empty2 = llvm.mlir.undef : !llvm.array<2 x f64>
-  %a20 = llvm.insertvalue %c0, %empty2[0] : !llvm.array<2 x f64>
-
-// NOTE: a00, a10, a20 are all same ConstantAggregate which not used at this point.
-// should not delete it before all of the uses of the ConstantAggregate finished. - - %a01 = llvm.insertvalue %c1, %a00[1] : !llvm.array<2 x f64> - %a11 = llvm.insertvalue %c1, %a10[1] : !llvm.array<2 x f64> - %a21 = llvm.insertvalue %c1, %a20[1] : !llvm.array<2 x f64> - %empty_r = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r0 = llvm.insertvalue %a01, %empty_r[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r1 = llvm.insertvalue %a11, %r0[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - %r2 = llvm.insertvalue %a21, %r1[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - llvm.return %r2 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - } - -// CHECK:@unique_const = global { [2 x double], [2 x double], [2 x double] } { [2 x double] [double 3.612250e-02, double 5.119230e-02], [2 x double] [double 3.312250e-02, double 5.219230e-02], [2 x double] [double 3.412250e-02, double 5.419230e-02] } - -llvm.mlir.global @unique_const() : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> { - %c0 = llvm.mlir.constant(3.612250e-02 : f64) : f64 - %c1 = llvm.mlir.constant(5.119230e-02 : f64) : f64 - - %c2 = llvm.mlir.constant(3.312250e-02 : f64) : f64 - %c3 = llvm.mlir.constant(5.219230e-02 : f64) : f64 - - %c4 = llvm.mlir.constant(3.412250e-02 : f64) : f64 - %c5 = llvm.mlir.constant(5.419230e-02 : f64) : f64 - - %2 = llvm.mlir.undef : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %3 = llvm.mlir.undef : !llvm.array<2 x f64> - - %4 = llvm.insertvalue %c0, %3[0] : !llvm.array<2 x f64> - %5 = llvm.insertvalue %c1, %4[1] : !llvm.array<2 x f64> - - %6 = llvm.insertvalue %5, %2[0] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %7 = llvm.insertvalue %c2, %3[0] : !llvm.array<2 x f64> - %8 = llvm.insertvalue %c3, %7[1] : !llvm.array<2 x f64> - - %9 = llvm.insertvalue %8, %6[1] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - %10 = llvm.insertvalue %c4, %3[0] : !llvm.array<2 x f64> - %11 = llvm.insertvalue %c5, %10[1] : !llvm.array<2 x f64> - - %12 = llvm.insertvalue %11, %9[2] : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> - - llvm.return %12 : !llvm.struct<(array<2 x f64>, array<2 x f64>, array<2 x f64>)> -} From d4cdb516eee49ecaf36380af4d8f923cc475e1d7 Mon Sep 17 00:00:00 2001 From: Heejin Ahn Date: Tue, 27 Feb 2024 14:00:43 -0800 Subject: [PATCH 010/114] [WebAssembly] Add RefTypeMem2Local pass (#81965) This adds `WebAssemblyRefTypeMem2Local` pass, which changes the address spaces of reference type `alloca`s to `addrspace(1)`. This in turn changes the address spaces of all `load` and `store` instructions that use the `alloca`s. `addrspace(1)` is `WASM_ADDRESS_SPACE_VAR`, and loads and stores to this address space become `local.get`s and `local.set`s, thanks to the Wasm local IR support added in https://github.com/llvm/llvm-project/commit/82f92e35c6464e23859c29422956caaceb623967. In a follow-up PR, I am planning to replace the usage of mem2reg pass with this to solve the reference type `alloca` problems described in #81575. 
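For illustration, a minimal before/after sketch of the rewrite (hand-written
here to summarize the test added below; the value names are illustrative):

  ; Before: a reference-type alloca in the default address space.
  %alloc = alloca ptr addrspace(10), align 1

  ; After: the alloca is recreated in addrspace(1) (WASM_ADDRESS_SPACE_VAR),
  ; so loads and stores to it can later be selected to local.get/local.set.
  %alloc.var = alloca ptr addrspace(10), align 1, addrspace(1)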
--- llvm/lib/Target/WebAssembly/CMakeLists.txt | 1 + llvm/lib/Target/WebAssembly/WebAssembly.h | 2 + .../WebAssemblyRefTypeMem2Local.cpp | 91 +++++++++++++++++++ .../WebAssembly/WebAssemblyTargetMachine.cpp | 1 + .../CodeGen/WebAssembly/ref-type-mem2local.ll | 57 ++++++++++++ 5 files changed, 152 insertions(+) create mode 100644 llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp create mode 100644 llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index bb2ccea5c145985..f430be2653b4ee8 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -43,6 +43,7 @@ add_llvm_target(WebAssemblyCodeGen WebAssemblyOptimizeLiveIntervals.cpp WebAssemblyOptimizeReturned.cpp WebAssemblyPeephole.cpp + WebAssemblyRefTypeMem2Local.cpp WebAssemblyRegisterInfo.cpp WebAssemblyRegColoring.cpp WebAssemblyRegNumbering.cpp diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h index 91765ad117bdb01..1c40addb6d6f785 100644 --- a/llvm/lib/Target/WebAssembly/WebAssembly.h +++ b/llvm/lib/Target/WebAssembly/WebAssembly.h @@ -30,6 +30,7 @@ ModulePass *createWebAssemblyAddMissingPrototypes(); ModulePass *createWebAssemblyFixFunctionBitcasts(); FunctionPass *createWebAssemblyOptimizeReturned(); FunctionPass *createWebAssemblyLowerRefTypesIntPtrConv(); +FunctionPass *createWebAssemblyRefTypeMem2Local(); // ISel and immediate followup passes. FunctionPass *createWebAssemblyISelDag(WebAssemblyTargetMachine &TM, @@ -59,6 +60,7 @@ ModulePass *createWebAssemblyMCLowerPrePass(); // PassRegistry initialization declarations. void initializeFixFunctionBitcastsPass(PassRegistry &); void initializeOptimizeReturnedPass(PassRegistry &); +void initializeWebAssemblyRefTypeMem2LocalPass(PassRegistry &); void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &); void initializeWebAssemblyArgumentMovePass(PassRegistry &); void initializeWebAssemblyCFGSortPass(PassRegistry &); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp new file mode 100644 index 000000000000000..d3c60ee289dfd27 --- /dev/null +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRefTypeMem2Local.cpp @@ -0,0 +1,91 @@ +//=== WebAssemblyRefTypeMem2Local.cpp - WebAssembly RefType Mem2Local -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Assign reference type allocas to local addrspace (addrspace(1)) so that +/// their loads and stores can be lowered to local.gets/local.sets. 
+/// +//===----------------------------------------------------------------------===// + +#include "Utils/WasmAddressSpaces.h" +#include "Utils/WebAssemblyTypeUtilities.h" +#include "WebAssembly.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" +using namespace llvm; + +#define DEBUG_TYPE "wasm-ref-type-mem2local" + +namespace { +class WebAssemblyRefTypeMem2Local final + : public FunctionPass, + public InstVisitor { + StringRef getPassName() const override { + return "WebAssembly Reference Types Memory to Local"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } + + bool runOnFunction(Function &F) override; + bool Changed = false; + +public: + static char ID; + WebAssemblyRefTypeMem2Local() : FunctionPass(ID) {} + + void visitAllocaInst(AllocaInst &AI); +}; +} // End anonymous namespace + +char WebAssemblyRefTypeMem2Local::ID = 0; +INITIALIZE_PASS(WebAssemblyRefTypeMem2Local, DEBUG_TYPE, + "Assign reference type allocas to local address space", true, + false) + +FunctionPass *llvm::createWebAssemblyRefTypeMem2Local() { + return new WebAssemblyRefTypeMem2Local(); +} + +void WebAssemblyRefTypeMem2Local::visitAllocaInst(AllocaInst &AI) { + if (WebAssembly::isWebAssemblyReferenceType(AI.getAllocatedType())) { + Changed = true; + IRBuilder<> IRB(AI.getContext()); + IRB.SetInsertPoint(&AI); + auto *NewAI = IRB.CreateAlloca(AI.getAllocatedType(), + WebAssembly::WASM_ADDRESS_SPACE_VAR, nullptr, + AI.getName() + ".var"); + + // The below is basically equivalent to AI.replaceAllUsesWith(NewAI), but we + // cannot use it because it requires the old and new types be the same, + // which is not true here because the address spaces are different. 
+ if (AI.hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(&AI, NewAI); + if (AI.isUsedByMetadata()) + ValueAsMetadata::handleRAUW(&AI, NewAI); + while (!AI.materialized_use_empty()) { + Use &U = *AI.materialized_use_begin(); + U.set(NewAI); + } + + AI.eraseFromParent(); + } +} + +bool WebAssemblyRefTypeMem2Local::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "********** WebAssembly RefType Mem2Local **********\n" + "********** Function: " + << F.getName() << '\n'); + + visit(F); + return Changed; +} diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index b2f7ee970a732f7..d088c7d925ddf00 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -77,6 +77,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { initializeLowerGlobalDtorsLegacyPassPass(PR); initializeFixFunctionBitcastsPass(PR); initializeOptimizeReturnedPass(PR); + initializeWebAssemblyRefTypeMem2LocalPass(PR); initializeWebAssemblyArgumentMovePass(PR); initializeWebAssemblySetP2AlignOperandsPass(PR); initializeWebAssemblyReplacePhysRegsPass(PR); diff --git a/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll new file mode 100644 index 000000000000000..a38243ca218cc1d --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/ref-type-mem2local.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -wasm-ref-type-mem2local -S | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +%externref = type ptr addrspace(10) +%funcref = type ptr addrspace(20) + +declare %externref @get_externref() +declare %funcref @get_funcref() +declare i32 @get_i32() +declare void @take_externref(%externref) +declare void @take_funcref(%funcref) +declare void @take_i32(i32) + +; Reference type allocas should be moved to addrspace(1) +; CHECK-LABEL: @test_ref_type_mem2local +define void @test_ref_type_mem2local() { +entry: + %alloc.externref = alloca %externref, align 1 + %eref = call %externref @get_externref() + store %externref %eref, ptr %alloc.externref, align 1 + %eref.loaded = load %externref, ptr %alloc.externref, align 1 + call void @take_externref(%externref %eref.loaded) + ; CHECK: %alloc.externref.var = alloca ptr addrspace(10), align 1, addrspace(1) + ; CHECK-NEXT: %eref = call ptr addrspace(10) @get_externref() + ; CHECK-NEXT: store ptr addrspace(10) %eref, ptr addrspace(1) %alloc.externref.var, align 1 + ; CHECK-NEXT: %eref.loaded = load ptr addrspace(10), ptr addrspace(1) %alloc.externref.var, align 1 + ; CHECK-NEXT: call void @take_externref(ptr addrspace(10) %eref.loaded) + + %alloc.funcref = alloca %funcref, align 1 + %fref = call %funcref @get_funcref() + store %funcref %fref, ptr %alloc.funcref, align 1 + %fref.loaded = load %funcref, ptr %alloc.funcref, align 1 + call void @take_funcref(%funcref %fref.loaded) + ; CHECK-NEXT: %alloc.funcref.var = alloca ptr addrspace(20), align 1, addrspace(1) + ; CHECK-NEXT: %fref = call ptr addrspace(20) @get_funcref() + ; CHECK-NEXT: store ptr addrspace(20) %fref, ptr addrspace(1) %alloc.funcref.var, align 1 + ; CHECK-NEXT: %fref.loaded = load ptr addrspace(20), ptr addrspace(1) %alloc.funcref.var, align 1 + ; CHECK-NEXT: call void @take_funcref(ptr addrspace(20) %fref.loaded) + + ret void +} + +; POD type allocas should stay the same +; CHECK-LABEL: @test_pod_type +define void @test_pod_type() { +entry: + %alloc.i32 = alloca i32 + %i32 = call i32 @get_i32() + store i32 
%i32, ptr %alloc.i32 + %i32.loaded = load i32, ptr %alloc.i32 + call void @take_i32(i32 %i32.loaded) + ; CHECK: %alloc.i32 = alloca i32, align 4{{$}} + ; CHECK-NOT: addrspace(1) + + ret void +} From 5964f4bcf012deb8b8dadcc403644c754a6b15e0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 27 Feb 2024 22:01:27 +0000 Subject: [PATCH 011/114] [gn build] Port d4cdb516eee4 --- llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn index 949b3b2147405c3..a8d6290f1b990db 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn @@ -61,6 +61,7 @@ static_library("LLVMWebAssemblyCodeGen") { "WebAssemblyOptimizeLiveIntervals.cpp", "WebAssemblyOptimizeReturned.cpp", "WebAssemblyPeephole.cpp", + "WebAssemblyRefTypeMem2Local.cpp", "WebAssemblyRegColoring.cpp", "WebAssemblyRegNumbering.cpp", "WebAssemblyRegStackify.cpp", From 3761ad01e125e3b38ed2d6f40b3cbcbac13611a5 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 27 Feb 2024 17:31:02 -0500 Subject: [PATCH 012/114] [libc++] Remove _LIBCPP_ATOMIC_ONLY_USE_BUILTINS (#82000) As discussed in #76647, _LIBCPP_ATOMIC_ONLY_USE_BUILTINS is a questionable configuration option. It makes our implementation of std::atomic even more complicated than it already is for a limited benefit. Indeed, the original goal of that setting was to decouple libc++ from libraries like compiler-rt and libatomic in Freestanding mode. We didn't have a clear understanding of goals and non-goals of Freestanding back then, but nowadays we do have a better understanding that removing all dependencies of libc++ in Freestanding is a non-goal. We should still be able to depend on builtins like those defined in compiler-rt for implementing our atomic operations in Freestanding. Freestanding means that there is no underlying operating system, not that there is no toolchain available. This patch removes the configuration option. This should have a very limited fallout since that configuration was only enabled with -ffreestanding, and libc++ basically doesn't work out of the box on Freestanding platforms today. The benefits are a slightly simpler implementation of std::atomic, getting rid of one of the ABI-incompatible representations of std::atomic, and clearing the way for proper Freestanding support to eventually land in the library. Fixes #81286 --- libcxx/docs/ReleaseNotes/19.rst | 5 + libcxx/include/__atomic/aliases.h | 2 +- libcxx/include/__atomic/cxx_atomic_impl.h | 291 +--------------------- libcxx/include/__config | 3 - 4 files changed, 7 insertions(+), 294 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index 6c8f8d17af9b1f9..78c6bb87a5a4024 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -73,6 +73,11 @@ Deprecations and Removals - The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19. +- The ``_LIBCPP_ATOMIC_ONLY_USE_BUILTINS`` configuration option has been removed in LLVM 19. This should not affect + many users, except perhaps users using the library with ``-ffreestanding`` with a toolchain where compiler-rt or + libatomic is not available. If you are one such user, please reach out to the libc++ developers so we can collaborate + on a path for supporting atomics properly on freestanding platforms. 
+ Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/__atomic/aliases.h b/libcxx/include/__atomic/aliases.h index 0fa289de54b0f12..db34f5ec02d744c 100644 --- a/libcxx/include/__atomic/aliases.h +++ b/libcxx/include/__atomic/aliases.h @@ -92,7 +92,7 @@ using __largest_lock_free_type = short; # elif ATOMIC_CHAR_LOCK_FREE == 2 using __largest_lock_free_type = char; # else -# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen in freestanding) +# define _LIBCPP_NO_LOCK_FREE_TYPES // There are no lockfree types (this can happen on unusual platforms) # endif # ifndef _LIBCPP_NO_LOCK_FREE_TYPES diff --git a/libcxx/include/__atomic/cxx_atomic_impl.h b/libcxx/include/__atomic/cxx_atomic_impl.h index 1a0b808a0cb1c4f..b900cc135f78f63 100644 --- a/libcxx/include/__atomic/cxx_atomic_impl.h +++ b/libcxx/include/__atomic/cxx_atomic_impl.h @@ -9,16 +9,13 @@ #ifndef _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H #define _LIBCPP___ATOMIC_CXX_ATOMIC_IMPL_H -#include <__atomic/is_always_lock_free.h> #include <__atomic/memory_order.h> #include <__config> #include <__memory/addressof.h> -#include <__type_traits/conditional.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_trivially_copyable.h> #include <__type_traits/remove_const.h> #include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -26,7 +23,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) || defined(_LIBCPP_ATOMIC_ONLY_USE_BUILTINS) +#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) // [atomics.types.generic]p1 guarantees _Tp is trivially copyable. Because // the default operator= in an object is not volatile, a byte-by-byte copy @@ -44,10 +41,6 @@ _LIBCPP_HIDE_FROM_ABI void __cxx_atomic_assign_volatile(_Tp volatile& __a_value, *__to++ = *__from++; } -#endif - -#if defined(_LIBCPP_HAS_GCC_ATOMIC_IMP) - template struct __cxx_atomic_base_impl { _LIBCPP_HIDE_FROM_ABI @@ -529,289 +522,7 @@ __cxx_atomic_fetch_xor(__cxx_atomic_base_impl<_Tp>* __a, _Tp __pattern, memory_o #endif // _LIBCPP_HAS_GCC_ATOMIC_IMP, _LIBCPP_HAS_C_ATOMIC_IMP -#ifdef _LIBCPP_ATOMIC_ONLY_USE_BUILTINS - -template -struct __cxx_atomic_lock_impl { - _LIBCPP_HIDE_FROM_ABI __cxx_atomic_lock_impl() _NOEXCEPT : __a_value(), __a_lock(0) {} - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR explicit __cxx_atomic_lock_impl(_Tp value) _NOEXCEPT - : __a_value(value), - __a_lock(0) {} - - _Tp __a_value; - mutable __cxx_atomic_base_impl<_LIBCPP_ATOMIC_FLAG_TYPE> __a_lock; - - _LIBCPP_HIDE_FROM_ABI void __lock() const volatile { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCPP_HIDE_FROM_ABI void __lock() const { - while (1 == __cxx_atomic_exchange(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(true), memory_order_acquire)) - /*spin*/; - } - _LIBCPP_HIDE_FROM_ABI void __unlock() const volatile { - __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCPP_HIDE_FROM_ABI void __unlock() const { - __cxx_atomic_store(&__a_lock, _LIBCPP_ATOMIC_FLAG_TYPE(false), memory_order_release); - } - _LIBCPP_HIDE_FROM_ABI _Tp __read() const volatile { - __lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a_value); - __unlock(); - return __old; - } - _LIBCPP_HIDE_FROM_ABI _Tp __read() const { - __lock(); - _Tp __old = __a_value; - __unlock(); - return __old; - } - _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const volatile { - __lock(); - 
__cxx_atomic_assign_volatile(*__dst, __a_value); - __unlock(); - } - _LIBCPP_HIDE_FROM_ABI void __read_inplace(_Tp* __dst) const { - __lock(); - *__dst = __a_value; - __unlock(); - } -}; - -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) { - __cxx_atomic_assign_volatile(__a->__a_value, __val); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_init(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val) { - __a->__a_value = __val; -} - -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) { - __a->__lock(); - __cxx_atomic_assign_volatile(__a->__a_value, __val); - __a->__unlock(); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_store(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __val, memory_order) { - __a->__lock(); - __a->__a_value = __val; - __a->__unlock(); -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, memory_order) { - return __a->__read(); -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, memory_order) { - return __a->__read(); -} - -template -_LIBCPP_HIDE_FROM_ABI void -__cxx_atomic_load(const volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) { - __a->__read_inplace(__dst); -} -template -_LIBCPP_HIDE_FROM_ABI void __cxx_atomic_load(const __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __dst, memory_order) { - __a->__read_inplace(__dst); -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __value); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_exchange(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __value, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value = __value; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong( - volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - _Tp __temp; - __a->__lock(); - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0); - if (__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_strong( - __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); - if (__ret) - std::memcpy(&__a->__a_value, &__value, sizeof(_Tp)); - else - std::memcpy(__expected, &__a->__a_value, sizeof(_Tp)); - __a->__unlock(); - return __ret; -} - -template -_LIBCPP_HIDE_FROM_ABI bool __cxx_atomic_compare_exchange_weak( - volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - _Tp __temp; - __a->__lock(); - __cxx_atomic_assign_volatile(__temp, __a->__a_value); - bool __ret = (std::memcmp(&__temp, __expected, sizeof(_Tp)) == 0); - if (__ret) - __cxx_atomic_assign_volatile(__a->__a_value, __value); - else - __cxx_atomic_assign_volatile(*__expected, __a->__a_value); - __a->__unlock(); - return __ret; -} -template -_LIBCPP_HIDE_FROM_ABI 
bool __cxx_atomic_compare_exchange_weak( - __cxx_atomic_lock_impl<_Tp>* __a, _Tp* __expected, _Tp __value, memory_order, memory_order) { - __a->__lock(); - bool __ret = (std::memcmp(&__a->__a_value, __expected, sizeof(_Tp)) == 0); - if (__ret) - std::memcpy(&__a->__a_value, &__value, sizeof(_Tp)); - else - std::memcpy(__expected, &__a->__a_value, sizeof(_Tp)); - __a->__unlock(); - return __ret; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old + __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp* -__cxx_atomic_fetch_add(volatile __cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, __old + __delta); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp* __cxx_atomic_fetch_add(__cxx_atomic_lock_impl<_Tp*>* __a, ptrdiff_t __delta, memory_order) { - __a->__lock(); - _Tp* __old = __a->__a_value; - __a->__a_value += __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old - __delta)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_sub(__cxx_atomic_lock_impl<_Tp>* __a, _Td __delta, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value -= __delta; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_and(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old & __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_and(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value &= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_or(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old | __pattern)); - __a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_or(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value |= __pattern; - __a->__unlock(); - return __old; -} - -template -_LIBCPP_HIDE_FROM_ABI _Tp -__cxx_atomic_fetch_xor(volatile __cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old; - __cxx_atomic_assign_volatile(__old, __a->__a_value); - __cxx_atomic_assign_volatile(__a->__a_value, _Tp(__old ^ __pattern)); - 
__a->__unlock(); - return __old; -} -template -_LIBCPP_HIDE_FROM_ABI _Tp __cxx_atomic_fetch_xor(__cxx_atomic_lock_impl<_Tp>* __a, _Tp __pattern, memory_order) { - __a->__lock(); - _Tp __old = __a->__a_value; - __a->__a_value ^= __pattern; - __a->__unlock(); - return __old; -} - -template ::__value, - __cxx_atomic_base_impl<_Tp>, - __cxx_atomic_lock_impl<_Tp> >::type> -#else template > -#endif //_LIBCPP_ATOMIC_ONLY_USE_BUILTINS struct __cxx_atomic_impl : public _Base { static_assert(is_trivially_copyable<_Tp>::value, "std::atomic requires that 'T' be a trivially copyable type"); diff --git a/libcxx/include/__config b/libcxx/include/__config index 0797880cb2f5da7..942bbe7cbb93cbf 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1200,9 +1200,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # ifndef _LIBCPP_ATOMIC_FLAG_TYPE # define _LIBCPP_ATOMIC_FLAG_TYPE bool # endif -# ifdef _LIBCPP_FREESTANDING -# define _LIBCPP_ATOMIC_ONLY_USE_BUILTINS -# endif # endif # if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(__no_thread_safety_analysis__) From 78647116d85b30c9b8b31a4690758c33f50c0550 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 27 Feb 2024 14:34:07 -0800 Subject: [PATCH 013/114] [nfc][compiler-rt]Remove round-up in __llvm_profile_get_num_data (#83194) - Update instrprof-basic.c as a regression test. --- compiler-rt/lib/profile/InstrProfilingBuffer.c | 11 ++--------- compiler-rt/test/profile/instrprof-basic.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 7c5c26f4d113b13..1c451d7ec756370 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -61,19 +61,12 @@ uint64_t __llvm_profile_get_size_for_buffer(void) { NamesBegin, NamesEnd, VTableBegin, VTableEnd, VNamesBegin, VNamesEnd); } +// NOTE: Caller should guarantee that `Begin` and `End` specifies a half-open +// interval [Begin, End). Namely, `End` is one-byte past the end of the array. COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_num_data(const __llvm_profile_data *Begin, const __llvm_profile_data *End) { intptr_t BeginI = (intptr_t)Begin, EndI = (intptr_t)End; - // `sizeof(__llvm_profile_data) - 1` is required in the numerator when - // [Begin, End] represents an inclusive range. - // For ELF, [Begin, End) represents the address of linker-inserted - // symbols `__start__` and `__stop_`. - // Thereby, `End` is one byte past the inclusive range, and - // `sizeof(__llvm_profile_data) - 1` is not necessary in the numerator to get - // the correct number of profile data. - // FIXME: Consider removing `sizeof(__llvm_profile_data) - 1` if this is true - // across platforms. 
return ((EndI + sizeof(__llvm_profile_data) - 1) - BeginI) / sizeof(__llvm_profile_data); } diff --git a/compiler-rt/test/profile/instrprof-basic.c b/compiler-rt/test/profile/instrprof-basic.c index de66e1b2746806e..702f521ba4ed8a5 100644 --- a/compiler-rt/test/profile/instrprof-basic.c +++ b/compiler-rt/test/profile/instrprof-basic.c @@ -1,6 +1,7 @@ // RUN: %clang_profgen -o %t -O3 %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge -o %t.profdata %t.profraw +// RUN: llvm-profdata show --all-functions %t.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=ORIG // // RUN: rm -fr %t.dir1 @@ -8,6 +9,7 @@ // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t // RUN: env LLVM_PROFILE_FILE=%t.dir1/profraw_e_%1m %run %t // RUN: llvm-profdata merge -o %t.em.profdata %t.dir1 +// RUN: llvm-profdata show --all-functions %t.em.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.em.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE // // RUN: rm -fr %t.dir2 @@ -16,6 +18,7 @@ // RUN: %run %t.merge // RUN: %run %t.merge // RUN: llvm-profdata merge -o %t.m.profdata %t.dir2/ +// RUN: llvm-profdata show --all-functions %t.m.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m.profdata -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=MERGE // // Test that merging is enabled by default with -fprofile-generate= @@ -27,6 +30,7 @@ // RUN: %run %t.merge3 // RUN: %run %t.merge3 // RUN: llvm-profdata merge -o %t.m3.profdata %t.dir3/ +// RUN: llvm-profdata show --all-functions %t.m3.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m3.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE // // Test that merging is enabled by default with -fprofile-generate @@ -40,6 +44,7 @@ // RUN: %run %t.dir4/merge4 // RUN: rm -f %t.dir4/merge4* // RUN: llvm-profdata merge -o %t.m4.profdata ./ +// RUN: llvm-profdata show --all-functions %t.m4.profdata | FileCheck %s --check-prefix=PROFCNT // RUN: %clang_profuse=%t.m4.profdata -O0 -o - -S -emit-llvm %s | FileCheck %s --check-prefix=COMMON --check-prefix=PGOMERGE /// Test that the merge pool size can be larger than 10. @@ -49,6 +54,13 @@ // RUN: not ls %t.dir5/e_%20m.profraw // RUN: ls %t.dir5/e_*.profraw | count 1 +// Test that all three functions have counters in the profile. +// PROFCNT-DAG: begin +// PROFCNT-DAG: end +// PROFCNT-DAG: main +// PROFCNT: Functions shown: 3 +// PROFCNT: Total functions: 3 + int begin(int i) { // COMMON: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !prof ![[PD1:[0-9]+]] if (i) From 001e18c816736602e3ad1c5dc6259143455610ea Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 27 Feb 2024 22:45:46 +0000 Subject: [PATCH 014/114] [NFC] clang-format SROA.cpp --- llvm/lib/Transforms/Scalar/SROA.cpp | 98 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 50 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 6c8785d52c4eabf..fad70e8bf2861f8 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -293,7 +293,7 @@ calculateFragment(DILocalVariable *Variable, if (!CurrentFragment) { if (auto Size = Variable->getSizeInBits()) { // Treat the current fragment as covering the whole variable. 
- CurrentFragment = DIExpression::FragmentInfo(*Size, 0); + CurrentFragment = DIExpression::FragmentInfo(*Size, 0); if (Target == CurrentFragment) return UseNoFrag; } @@ -1213,8 +1213,9 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { if (!IsOffsetKnown) return PI.setAborted(&II); - insertUse(II, Offset, Length ? Length->getLimitedValue() - : AllocSize - Offset.getLimitedValue(), + insertUse(II, Offset, + Length ? Length->getLimitedValue() + : AllocSize - Offset.getLimitedValue(), (bool)Length); } @@ -1669,7 +1670,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { } // Inject loads into all of the pred blocks. - DenseMap InjectedLoads; + DenseMap InjectedLoads; for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { BasicBlock *Pred = PN.getIncomingBlock(Idx); Value *InVal = PN.getIncomingValue(Idx); @@ -1678,7 +1679,7 @@ static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { // basic block, as long as the value is the same. So if we already injected // a load in the predecessor, then we should reuse the same load for all // duplicated entries. - if (Value* V = InjectedLoads.lookup(Pred)) { + if (Value *V = InjectedLoads.lookup(Pred)) { NewPN->addIncoming(V, Pred); continue; } @@ -2077,8 +2078,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= cast(Ty)->getNumElements()) return false; - uint64_t EndOffset = - std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); + uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); uint64_t EndIndex = EndOffset / ElementSize; if (EndIndex * ElementSize != EndOffset || EndIndex > cast(Ty)->getNumElements()) @@ -2754,8 +2754,8 @@ class AllocaSliceRewriter : public InstVisitor { Instruction *OldUserI = cast(OldUse->getUser()); IRB.SetInsertPoint(OldUserI); IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); - IRB.getInserter().SetNamePrefix( - Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); + IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." + + Twine(BeginOffset) + "."); CanSROA &= visit(cast(OldUse->getUser())); if (VecTy || IntTy) @@ -2808,7 +2808,7 @@ class AllocaSliceRewriter : public InstVisitor { #else Twine() #endif - ); + ); } /// Compute suitable alignment to access this slice of the *new* @@ -3189,8 +3189,7 @@ class AllocaSliceRewriter : public InstVisitor { const bool CanContinue = [&]() { if (VecTy || IntTy) return true; - if (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset) + if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) return false; // Length must be in range for FixedVectorType. 
auto *C = cast(II.getLength()); @@ -3984,9 +3983,9 @@ class AggLoadStoreRewriter : public InstVisitor { if (!Sel) return false; - LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" - << "\n original: " << *Sel - << "\n " << GEPI); + LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):\n"; + dbgs() << " original: " << *Sel << "\n"; + dbgs() << " " << GEPI << "\n";); auto GetNewOps = [&](Value *SelOp) { SmallVector NewOps; @@ -4023,9 +4022,9 @@ class AggLoadStoreRewriter : public InstVisitor { Visited.insert(NSelI); enqueueUsers(*NSelI); - LLVM_DEBUG(dbgs() << "\n to: " << *NTrue - << "\n " << *NFalse - << "\n " << *NSel << '\n'); + LLVM_DEBUG(dbgs() << " to: " << *NTrue << "\n"; + dbgs() << " " << *NFalse << "\n"; + dbgs() << " " << *NSel << "\n";); return true; } @@ -4037,18 +4036,17 @@ class AggLoadStoreRewriter : public InstVisitor { PHINode *PHI = cast(GEPI.getPointerOperand()); if (GEPI.getParent() != PHI->getParent() || - llvm::any_of(PHI->incoming_values(), [](Value *In) - { Instruction *I = dyn_cast(In); - return !I || isa(I) || isa(I) || - succ_empty(I->getParent()) || - !I->getParent()->isLegalToHoistInto(); - })) + llvm::any_of(PHI->incoming_values(), [](Value *In) { + Instruction *I = dyn_cast(In); + return !I || isa(I) || isa(I) || + succ_empty(I->getParent()) || + !I->getParent()->isLegalToHoistInto(); + })) return false; - LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):" - << "\n original: " << *PHI - << "\n " << GEPI - << "\n to: "); + LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):\n"; + dbgs() << " original: " << *PHI << "\n"; + dbgs() << " " << GEPI << "\n";); SmallVector Index(GEPI.indices()); bool IsInBounds = GEPI.isInBounds(); @@ -4078,8 +4076,10 @@ class AggLoadStoreRewriter : public InstVisitor { Visited.insert(NewPN); enqueueUsers(*NewPN); - LLVM_DEBUG(for (Value *In : NewPN->incoming_values()) - dbgs() << "\n " << *In; + LLVM_DEBUG(dbgs() << " to: "; + for (Value *In + : NewPN->incoming_values()) dbgs() + << "\n " << *In; dbgs() << "\n " << *NewPN << '\n'); return true; @@ -4089,8 +4089,7 @@ class AggLoadStoreRewriter : public InstVisitor { if (foldGEPSelect(GEPI)) return true; - if (isa(GEPI.getPointerOperand()) && - foldGEPPhi(GEPI)) + if (isa(GEPI.getPointerOperand()) && foldGEPPhi(GEPI)) return true; enqueueUsers(GEPI); @@ -4162,17 +4161,17 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, return nullptr; if (isa(Ty) || isa(Ty)) { - Type *ElementTy; - uint64_t TyNumElements; - if (auto *AT = dyn_cast(Ty)) { - ElementTy = AT->getElementType(); - TyNumElements = AT->getNumElements(); - } else { - // FIXME: This isn't right for vectors with non-byte-sized or - // non-power-of-two sized elements. - auto *VT = cast(Ty); - ElementTy = VT->getElementType(); - TyNumElements = VT->getNumElements(); + Type *ElementTy; + uint64_t TyNumElements; + if (auto *AT = dyn_cast(Ty)) { + ElementTy = AT->getElementType(); + TyNumElements = AT->getNumElements(); + } else { + // FIXME: This isn't right for vectors with non-byte-sized or + // non-power-of-two sized elements. 
+    auto *VT = cast<FixedVectorType>(Ty);
+    ElementTy = VT->getElementType();
+    TyNumElements = VT->getNumElements();
   }
   uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
   uint64_t NumSkippedElements = Offset / ElementSize;
@@ -4853,9 +4852,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     ++NumNewAllocas;
   }

-  LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
-                    << "[" << P.beginOffset() << "," << P.endOffset()
-                    << ") to: " << *NewAI << "\n");
+  LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
+                    << "," << P.endOffset() << ") to: " << *NewAI << "\n");

   // Track the high watermark on the worklist as it is only relevant for
   // promoted allocas. We will reset it to this point if the alloca is not in
@@ -5040,8 +5038,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
           IsSorted = false;
         }
       }
-  }
-  else {
+  } else {
     // We only allow whole-alloca splittable loads and stores
     // for a large alloca to avoid creating too large BitVector.
     for (Slice &S : AS) {
@@ -5069,7 +5066,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
     uint64_t Offset;
     uint64_t Size;
     Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
-      : Alloca(AI), Offset(O), Size(S) {}
+        : Alloca(AI), Offset(O), Size(S) {}
   };
   SmallVector<Fragment, 4> Fragments;

@@ -5083,7 +5080,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
           DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
       // Don't include any padding.
       uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
-      Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+      Fragments.push_back(
+          Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
     }
   }
   ++NumPartitions;

From a3748d60ff18f612cd26a0b4ca7f05f2fbef264d Mon Sep 17 00:00:00 2001
From: Iman Hosseini
Date: Tue, 27 Feb 2024 22:55:45 +0000
Subject: [PATCH 015/114] [MLIR] Update GPU.md: add gpu kernel outlining to doc example. (#83141)

gpu-kernel-outlining is needed for this example to work.

---
 mlir/docs/Dialects/GPU.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md
index 85255fdc5e6439e..8a3acc33600a415 100644
--- a/mlir/docs/Dialects/GPU.md
+++ b/mlir/docs/Dialects/GPU.md
@@ -50,6 +50,7 @@ An example of how the compilation workflow look is:
 ```
 mlir-opt example.mlir \
   --pass-pipeline="builtin.module( \
+    gpu-kernel-outlining, \ # Outline gpu.launch body to a kernel.
     nvvm-attach-target{chip=sm_90 O=3}, \ # Attach an NVVM target to a gpu.module op.
     gpu.module(convert-gpu-to-nvvm), \ # Convert GPU to NVVM.
     gpu-to-llvm, \ # Convert GPU to LLVM.

From 9d0acb872a5063f570366cd0e94b069d286cc71f Mon Sep 17 00:00:00 2001
From: Diego Caballero
Date: Tue, 27 Feb 2024 15:27:31 -0800
Subject: [PATCH 016/114] [mlir][Vector] Add support for trunci to narrow type emulation (#82565)

This PR adds support for `arith.trunci` to vector narrow type emulation
for iX -> i4 truncations, for X >= 8. For now, the pattern only works for
1D vectors and is based on `vector.shuffle` ops. We would need
`vector.deinterleave` to add n-D vector support.
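As an illustration, the 1D case is emulated as follows (a sketch mirroring
the doc comment and tests in the diff below; SSA value names are
illustrative):

  // Before:
  %0 = arith.trunci %in : vector<8xi32> to vector<8xi4>

  // After: truncate to i8 first, then pack pairs of i4 nibbles per byte.
  %cst = arith.constant dense<15> : vector<4xi8>    // low-nibble mask
  %cst_0 = arith.constant dense<4> : vector<4xi8>   // shift amount
  %0 = arith.trunci %in : vector<8xi32> to vector<8xi8>
  %1 = vector.shuffle %0, %0 [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
  %2 = vector.shuffle %0, %0 [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
  %3 = arith.andi %1, %cst : vector<4xi8>           // keep the low i4 values
  %4 = arith.shli %2, %cst_0 : vector<4xi8>         // move high i4 values up
  %5 = arith.ori %3, %4 : vector<4xi8>              // merge nibble pairs
  %6 = vector.bitcast %5 : vector<4xi8> to vector<8xi4>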
---
 .../Transforms/VectorEmulateNarrowType.cpp    | 126 +++++++++++++++++-
 .../Vector/vector-rewrite-narrow-types.mlir   |  42 ++++++
 2 files changed, 163 insertions(+), 5 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
index fc11ae63e718a57..dc6f126aae4c876 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp
@@ -729,8 +729,8 @@ static LogicalResult commonConversionPrecondition(PatternRewriter &rewriter,

   // TODO: consider relaxing this restriction in the future if we find ways
   // to really work with subbyte elements across the MLIR/LLVM boundary.
-  unsigned resultBitwidth = preconditionType.getElementTypeBitWidth();
-  if (resultBitwidth % 8 != 0)
+  unsigned bitwidth = preconditionType.getElementTypeBitWidth();
+  if (bitwidth % 8 != 0)
     return rewriter.notifyMatchFailure(op, "bitwidth is not k * 8");

   return success();
@@ -768,6 +768,10 @@ static LogicalResult alignedConversionPrecondition(PatternRewriter &rewriter,
       (dstElemBitwidth % srcElemBitwidth) != 0)
     return rewriter.notifyMatchFailure(op, "Not a supported aligned case");

+  if ((srcType.getShape().back() % 2) != 0)
+    return rewriter.notifyMatchFailure(
+        op, "Not an even number of i4 elements in trailing dim");
+
   return success();
 }

@@ -876,6 +880,58 @@ static Value rewriteI4ToI8SignedExt(PatternRewriter &rewriter, Location loc,
   return rewriter.create<vector::InterleaveOp>(loc, low, high);
 }

+/// Rewrite the i8 -> i4 truncation into a sequence of shuffles and bitwise ops
+/// that take advantage of high-level information to avoid leaving LLVM to
+/// scramble with peephole optimizations.
+static Value rewriteI8ToI4Trunc(PatternRewriter &rewriter, Location loc,
+                                Value srcValue) {
+  VectorType srcVecType = cast<VectorType>(srcValue.getType());
+  assert(srcVecType.getElementType().isSignlessInteger(8) &&
+         "Expected i8 type");
+
+  // 1. De-interleave low and high i8 elements.
+  int64_t vecDimSize = srcVecType.getShape().back();
+  SmallVector<int64_t> deinterleaveLowMaskValues;
+  SmallVector<int64_t> deinterleaveHighMaskValues;
+  assert((vecDimSize % 2) == 0 && "Odd number of i4 elements");
+  deinterleaveLowMaskValues.reserve(vecDimSize / 2);
+  deinterleaveHighMaskValues.reserve(vecDimSize / 2);
+  for (int i = 0, end = vecDimSize; i < end; i += 2) {
+    deinterleaveLowMaskValues.push_back(i);
+    deinterleaveHighMaskValues.push_back(i + 1);
+  }
+
+  auto lowShuffleOp = rewriter.create<vector::ShuffleOp>(
+      loc, srcValue, srcValue,
+      rewriter.getI64ArrayAttr(deinterleaveLowMaskValues));
+  auto highShuffleOp = rewriter.create<vector::ShuffleOp>(
+      loc, srcValue, srcValue,
+      rewriter.getI64ArrayAttr(deinterleaveHighMaskValues));
+
+  // 2. Zero out the upper side of each low i8 element.
+  constexpr int8_t i8LowBitMask = 0x0F;
+  Value zeroOutMask = rewriter.create<arith::ConstantOp>(
+      loc,
+      DenseElementsAttr::get(lowShuffleOp.getResultVectorType(), i8LowBitMask));
+  Value zeroOutLow =
+      rewriter.create<arith::AndIOp>(loc, lowShuffleOp, zeroOutMask);
+
+  // 3. Move high i4 values to upper side of the byte.
+  constexpr int8_t bitsToShift = 4;
+  VectorType deinterI8VecType = highShuffleOp.getResultVectorType();
+  auto shiftValues = rewriter.create<arith::ConstantOp>(
+      loc, DenseElementsAttr::get(deinterI8VecType, bitsToShift));
+  Value shlHigh =
+      rewriter.create<arith::ShLIOp>(loc, highShuffleOp, shiftValues);
+
+  // 4. Merge high and low i4 values.
+  auto mergedHiLowOp = rewriter.create<arith::OrIOp>(loc, zeroOutLow, shlHigh);
+
+  // 5. Generate a bitcast vector<Xxi8> -> vector<2Xxi4>.
+  auto i4VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI4Type());
+  return rewriter.create<vector::BitCastOp>(loc, i4VecType, mergedHiLowOp);
+}
+
 namespace {
 /// Rewrite bitcast(trunci) to a sequence of shuffles and bitwise ops that take
 /// advantage of high-level information to avoid leaving LLVM to scramble with
@@ -1019,7 +1075,7 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {

   LogicalResult matchAndRewrite(ConversionOpType conversionOp,
                                 PatternRewriter &rewriter) const override {
-    // Set up the BitCastRewriter and verify the preconditions.
+    // Verify the preconditions.
     Value srcValue = conversionOp.getIn();
     auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
     auto dstVecType = dyn_cast<VectorType>(conversionOp.getType());
@@ -1043,6 +1099,65 @@ struct RewriteAlignedSubByteIntSignedExt : OpRewritePattern<ConversionOpType> {
   }
 };

+/// Rewrite the i8 -> i4 part of any truncation into a sequence of shuffles and
+/// bitwise ops that take advantage of high-level information to avoid leaving
+/// LLVM to scramble with peephole optimizations.
+///
+/// For example:
+///    arith.trunci %in : vector<8xi32> to vector<8xi4>
+/// is rewritten as
+///
+///   %cst = arith.constant dense<15> : vector<4xi8>
+///   %cst_0 = arith.constant dense<4> : vector<4xi8>
+///   %0 = arith.trunci %in : vector<8xi32> to vector<8xi8>
+///   %1 = vector.shuffle %0, %0 [0, 2, 4, 6] : vector<8xi8>, vector<8xi8>
+///   %2 = vector.shuffle %0, %0 [1, 3, 5, 7] : vector<8xi8>, vector<8xi8>
+///   %3 = arith.andi %1, %cst : vector<4xi8>
+///   %4 = arith.shli %2, %cst_0 : vector<4xi8>
+///   %5 = arith.ori %3, %4 : vector<4xi8>
+///   %6 = vector.bitcast %5 : vector<4xi8> to vector<8xi4>
+///
+struct RewriteAlignedSubByteIntTrunc : OpRewritePattern<arith::TruncIOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(arith::TruncIOp truncOp,
+                                PatternRewriter &rewriter) const override {
+    // Verify the preconditions.
+    Value srcValue = truncOp.getIn();
+    auto srcVecType = dyn_cast<VectorType>(srcValue.getType());
+    auto dstVecType = dyn_cast<VectorType>(truncOp.getType());
+    if (!srcVecType || !dstVecType)
+      return failure();
+
+    // Only single dim vectors are supported until we have
+    // `vector.deinterleave`.
+    if (srcVecType.getRank() != 1)
+      return failure();
+
+    if (failed(commonConversionPrecondition(rewriter, srcVecType, truncOp)))
+      return failure();
+
+    // Check general alignment preconditions. We invert the src/dst type order
+    // to reuse the existing precondition logic.
+    if (failed(alignedConversionPrecondition(rewriter, dstVecType, srcVecType,
+                                             truncOp)))
+      return failure();
+
+    // Create a new iX -> i8 truncation op.
+    Location loc = truncOp.getLoc();
+    auto i8VecType = srcVecType.cloneWith(std::nullopt, rewriter.getI8Type());
+    Value i8TruncVal =
+        rewriter.create<arith::TruncIOp>(loc, i8VecType, srcValue);
+
+    // Rewrite the i8 -> i4 truncation part.
+    Value subByteTrunc = rewriteI8ToI4Trunc(rewriter, loc, i8TruncVal);
+
+    // Finalize the rewrite.
+    rewriter.replaceOp(truncOp, subByteTrunc);
+    return success();
+  }
+};
+
 /// Rewrite a sub-byte vector transpose into a sequence of instructions that
 /// perform the transpose on wider (byte) element types.
 /// For example:
@@ -1115,8 +1230,9 @@ void vector::populateVectorNarrowTypeRewritePatterns(

   // Patterns for aligned cases. We set higher priority as they are expected to
   // generate better performance for aligned cases.
patterns.add, - RewriteAlignedSubByteIntSignedExt>( - patterns.getContext(), benefit.getBenefit() + 1); + RewriteAlignedSubByteIntSignedExt, + RewriteAlignedSubByteIntTrunc>(patterns.getContext(), + benefit.getBenefit() + 1); } void vector::populateVectorTransposeNarrowTypeRewritePatterns( diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir index 94e78ce40a3c19a..8f0148119806c98 100644 --- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir +++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir @@ -262,6 +262,48 @@ func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> { return %0 : vector<8x32xf32> } +// CHECK-LABEL: func.func @aligned_trunci( +func.func @aligned_trunci(%a: vector<8xi32>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> { +// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> +// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8> +// CHECK: %[[LOW:.*]] = vector.shuffle %[[I8]], %[[I8]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8> +// CHECK: %[[HIGH:.*]] = vector.shuffle %[[I8]], %[[I8]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8> +// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> +// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> +// CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi32> to vector<8xi4> + return %0 : vector<8xi4> +} + +// CHECK-LABEL: func.func @aligned_trunci_base_case( +func.func @aligned_trunci_base_case(%a: vector<8xi8>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> { +// CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> +// CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[LOW:.*]] = vector.shuffle %[[IN]], %[[IN]] [0, 2, 4, 6] : vector<8xi8>, vector<8xi8> +// CHECK: %[[HIGH:.*]] = vector.shuffle %[[IN]], %[[IN]] [1, 3, 5, 7] : vector<8xi8>, vector<8xi8> +// CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> +// CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> +// CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi8> to vector<8xi4> + return %0 : vector<8xi4> +} + +// CHECK-LABEL: func.func @aligned_trunci_2d( +func.func @aligned_trunci_2d(%a: vector<8x32xi32>) -> vector<8x32xi4> { +// CHECK-NOT: vector.shuffle +// CHECK-NOT: vector.andi +// CHECK-NOT: vector.shli +// CHECK-NOT: vector.ori +// CHECK: arith.trunci + %0 = arith.trunci %a : vector<8x32xi32> to vector<8x32xi4> + return %0 : vector<8x32xi4> +} + // CHECK-LABEL: func.func @i4_transpose( func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { // CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { From 0e84e2748b40eb757a5c52a983c87dd4f25a1587 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 27 Feb 2024 15:38:31 -0800 Subject: [PATCH 017/114] [BOLT] Move test under X86 target. NFCI (#83202) instrument-wrong-target.s test requires X86 host. Move it under runtime/X86. 
--- bolt/test/runtime/{ => X86}/instrument-wrong-target.s | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename bolt/test/runtime/{ => X86}/instrument-wrong-target.s (88%) diff --git a/bolt/test/runtime/instrument-wrong-target.s b/bolt/test/runtime/X86/instrument-wrong-target.s similarity index 88% rename from bolt/test/runtime/instrument-wrong-target.s rename to bolt/test/runtime/X86/instrument-wrong-target.s index b25c924ffbcc0ff..343d93a89ed1306 100644 --- a/bolt/test/runtime/instrument-wrong-target.s +++ b/bolt/test/runtime/X86/instrument-wrong-target.s @@ -1,8 +1,8 @@ # Test that BOLT errs when trying to instrument a binary with a different # architecture than the one BOLT is built for. -# REQUIRES: x86_64-linux,bolt-runtime -# REQUIRES: target-x86_64 && aarch64-registered-target +# REQUIRES: system-linux,bolt-runtime +# REQUIRES: aarch64-registered-target # RUN: llvm-mc -triple aarch64 -filetype=obj %s -o %t.o # RUN: ld.lld -q -pie -o %t.exe %t.o From 04e8653f189bf3d65680c7fb3b3033ad82903ee9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 27 Feb 2024 17:45:15 -0600 Subject: [PATCH 018/114] [libc] Add "include/" to the LLVM include directories (#83199) Summary: Recent changes added an include path in the float128 type that used the internal `libc` path to find the macro. This doesn't work once it's installed because we need to search from the root of the install dir. This patch adds "include/" to the include path so that our inclusion of installed headers always match the internal use. --- libc/benchmarks/automemcpy/lib/CMakeLists.txt | 3 ++- libc/cmake/modules/LLVMLibCObjectRules.cmake | 4 ++++ libc/cmake/modules/LLVMLibCTestRules.cmake | 4 ++++ libc/cmake/modules/compiler_features/check_fixed_point.cpp | 2 +- libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp | 2 +- libc/include/llvm-libc-types/float128.h | 2 +- libc/src/__support/CPP/limits.h | 2 +- libc/src/__support/CPP/type_traits/is_fixed_point.h | 2 +- libc/src/__support/HashTable/table.h | 2 +- libc/src/__support/fixed_point/fx_bits.h | 2 +- libc/src/__support/fixed_point/fx_rep.h | 2 +- libc/src/__support/fixed_point/sqrt.h | 2 +- libc/src/__support/macros/properties/float.h | 4 ++-- libc/src/stdfix/abshk.h | 2 +- libc/src/stdfix/abshr.h | 2 +- libc/src/stdfix/absk.h | 2 +- libc/src/stdfix/abslk.h | 2 +- libc/src/stdfix/abslr.h | 2 +- libc/src/stdfix/absr.h | 2 +- libc/src/stdfix/roundhk.h | 2 +- libc/src/stdfix/roundhr.h | 2 +- libc/src/stdfix/roundk.h | 2 +- libc/src/stdfix/roundlk.h | 2 +- libc/src/stdfix/roundlr.h | 2 +- libc/src/stdfix/roundr.h | 2 +- libc/src/stdfix/rounduhk.h | 2 +- libc/src/stdfix/rounduhr.h | 2 +- libc/src/stdfix/rounduk.h | 2 +- libc/src/stdfix/roundulk.h | 2 +- libc/src/stdfix/roundulr.h | 2 +- libc/src/stdfix/roundur.h | 2 +- libc/src/stdfix/sqrtuhk.h | 2 +- libc/src/stdfix/sqrtuhr.h | 2 +- libc/src/stdfix/sqrtuk.h | 2 +- libc/src/stdfix/sqrtulr.h | 2 +- libc/src/stdfix/sqrtur.h | 2 +- libc/src/stdio/printf_core/fixed_converter.h | 2 +- libc/src/stdio/printf_core/parser.h | 2 +- libc/src/sys/epoll/epoll_pwait.h | 4 ++-- libc/src/sys/epoll/epoll_pwait2.h | 6 +++--- libc/src/sys/epoll/epoll_wait.h | 2 +- libc/src/sys/epoll/linux/epoll_pwait.cpp | 4 ++-- libc/src/sys/epoll/linux/epoll_pwait2.cpp | 6 +++--- libc/src/sys/epoll/linux/epoll_wait.cpp | 4 ++-- libc/test/UnitTest/CMakeLists.txt | 3 ++- libc/test/UnitTest/LibcTest.cpp | 2 +- libc/test/include/stdbit_test.cpp | 2 +- libc/test/include/stdckdint_test.cpp | 2 +- libc/test/integration/startup/CMakeLists.txt | 1 + 
libc/test/integration/startup/gpu/rpc_interface_test.cpp | 2 +- libc/test/integration/startup/gpu/rpc_stream_test.cpp | 2 +- libc/test/integration/startup/gpu/rpc_test.cpp | 2 +- libc/test/src/__support/fixed_point/fx_bits_test.cpp | 2 +- libc/test/src/math/differential_testing/CMakeLists.txt | 1 + libc/utils/LibcTableGenUtil/CMakeLists.txt | 2 +- libc/utils/gpu/loader/Loader.h | 2 +- 56 files changed, 72 insertions(+), 60 deletions(-) diff --git a/libc/benchmarks/automemcpy/lib/CMakeLists.txt b/libc/benchmarks/automemcpy/lib/CMakeLists.txt index 0c7d399d4023bbb..bb6a5631f2c3f69 100644 --- a/libc/benchmarks/automemcpy/lib/CMakeLists.txt +++ b/libc/benchmarks/automemcpy/lib/CMakeLists.txt @@ -18,7 +18,8 @@ add_custom_command( add_library(automemcpy_implementations "${Implementations}") target_link_libraries(automemcpy_implementations PUBLIC LLVMSupport libc-memory-benchmark) -target_include_directories(automemcpy_implementations PRIVATE ${LIBC_SOURCE_DIR} ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) +target_include_directories(automemcpy_implementations PRIVATE + ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include ${LIBC_AUTOMEMCPY_INCLUDE_DIR}) target_compile_options(automemcpy_implementations PRIVATE ${LIBC_COMPILE_OPTIONS_NATIVE} "SHELL:-mllvm -combiner-global-alias-analysis" -fno-builtin) llvm_update_compile_flags(automemcpy_implementations) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 0649e9f7a767095..5469799f0239830 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -59,6 +59,7 @@ function(create_object_library fq_target_name) ) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${fq_target_name} PRIVATE ${compile_options}) # The NVPTX target is installed as LLVM-IR but the internal testing toolchain @@ -73,6 +74,7 @@ function(create_object_library fq_target_name) ) target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${internal_target_name} PRIVATE ${compile_options} -fno-lto -march=${LIBC_GPU_TARGET_ARCHITECTURE}) endif() @@ -279,6 +281,7 @@ function(create_entrypoint_object fq_target_name) target_compile_options(${internal_target_name} BEFORE PRIVATE ${common_compile_options}) target_include_directories(${internal_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${internal_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) add_dependencies(${internal_target_name} ${full_deps_list}) target_link_libraries(${internal_target_name} ${full_deps_list}) @@ -300,6 +303,7 @@ function(create_entrypoint_object fq_target_name) target_compile_options(${fq_target_name} BEFORE PRIVATE ${common_compile_options} -DLIBC_COPT_PUBLIC_PACKAGING) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) add_dependencies(${fq_target_name} ${full_deps_list}) 
target_link_libraries(${fq_target_name} ${full_deps_list}) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 836e15d34741b2a..5981d427b71f8da 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -184,6 +184,7 @@ function(create_libc_unittest fq_target_name) ) target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) if(NOT LIBC_UNITTEST_CXX_STANDARD) @@ -317,6 +318,7 @@ function(add_libc_fuzzer target_name) ) target_include_directories(${fq_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) target_link_libraries(${fq_target_name} PRIVATE ${link_object_files} @@ -457,6 +459,7 @@ function(add_integration_test test_name) PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) _get_hermetic_test_compile_options(compile_options "${INTEGRATION_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) @@ -632,6 +635,7 @@ function(add_libc_hermetic_test test_name) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}/include) _get_hermetic_test_compile_options(compile_options "${HERMETIC_TEST_COMPILE_OPTIONS}") target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) diff --git a/libc/cmake/modules/compiler_features/check_fixed_point.cpp b/libc/cmake/modules/compiler_features/check_fixed_point.cpp index a5192697d43f779..9199340fe652ea5 100644 --- a/libc/cmake/modules/compiler_features/check_fixed_point.cpp +++ b/libc/cmake/modules/compiler_features/check_fixed_point.cpp @@ -1,4 +1,4 @@ -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #ifndef LIBC_COMPILER_HAS_FIXED_POINT #error unsupported diff --git a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp index b4a8621891203da..c385c3a8f3e44a1 100644 --- a/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp +++ b/libc/fuzzing/stdio/printf_fixed_conv_fuzz.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "src/stdio/snprintf.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/fixed_point/fx_rep.h" diff --git a/libc/include/llvm-libc-types/float128.h b/libc/include/llvm-libc-types/float128.h index 61a094fdb96b127..1907a5e3ece727a 100644 --- a/libc/include/llvm-libc-types/float128.h +++ b/libc/include/llvm-libc-types/float128.h @@ -9,7 +9,7 @@ #ifndef 
__LLVM_LIBC_TYPES_FLOAT128_H__ #define __LLVM_LIBC_TYPES_FLOAT128_H__ -#include // LDBL_MANT_DIG +#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG // Currently, C23 `_Float128` type is only defined as a built-in type in GCC 7 // or later, and only for C. For C++, or for clang, `__float128` is defined diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h index 1ffde5f9556f871..6440e8cb358fa76 100644 --- a/libc/src/__support/CPP/limits.h +++ b/libc/src/__support/CPP/limits.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H #define LLVM_LIBC_SRC___SUPPORT_CPP_LIMITS_H -#include "include/llvm-libc-macros/limits-macros.h" // CHAR_BIT +#include "llvm-libc-macros/limits-macros.h" // CHAR_BIT #include "src/__support/CPP/type_traits/is_integral.h" #include "src/__support/CPP/type_traits/is_signed.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/CPP/type_traits/is_fixed_point.h b/libc/src/__support/CPP/type_traits/is_fixed_point.h index 317ba39748b7de4..e139e6477e2e7e9 100644 --- a/libc/src/__support/CPP/type_traits/is_fixed_point.h +++ b/libc/src/__support/CPP/type_traits/is_fixed_point.h @@ -12,7 +12,7 @@ #include "src/__support/CPP/type_traits/remove_cv.h" #include "src/__support/macros/attributes.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE::cpp { diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h index 5b4697e5245b6d1..e2a26d0e2b5f366 100644 --- a/libc/src/__support/HashTable/table.h +++ b/libc/src/__support/HashTable/table.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_table_H -#include "include/llvm-libc-types/ENTRY.h" +#include "llvm-libc-types/ENTRY.h" #include "src/__support/CPP/bit.h" // bit_ceil #include "src/__support/CPP/new.h" #include "src/__support/HashTable/bitmask.h" diff --git a/libc/src/__support/fixed_point/fx_bits.h b/libc/src/__support/fixed_point/fx_bits.h index fcd47cd72cbb316..0c8d03beb84ae59 100644 --- a/libc/src/__support/fixed_point/fx_bits.h +++ b/libc/src/__support/fixed_point/fx_bits.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXBITS_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h index fcd7554c4d85508..7d18f14a8c483ac 100644 --- a/libc/src/__support/fixed_point/fx_rep.h +++ b/libc/src/__support/fixed_point/fx_rep.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_FXREP_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR diff --git a/libc/src/__support/fixed_point/sqrt.h b/libc/src/__support/fixed_point/sqrt.h index d8df294b18a1a8d..236ebb2857030ba 100644 --- a/libc/src/__support/fixed_point/sqrt.h +++ b/libc/src/__support/fixed_point/sqrt.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H #define LLVM_LIBC_SRC___SUPPORT_FIXEDPOINT_SQRT_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include 
"llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/bit.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE diff --git a/libc/src/__support/macros/properties/float.h b/libc/src/__support/macros/properties/float.h index 08a1ab726cbde46..510f39237493581 100644 --- a/libc/src/__support/macros/properties/float.h +++ b/libc/src/__support/macros/properties/float.h @@ -11,8 +11,8 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_FLOAT_H -#include "include/llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG -#include "include/llvm-libc-types/float128.h" // float128 +#include "llvm-libc-macros/float-macros.h" // LDBL_MANT_DIG +#include "llvm-libc-types/float128.h" // float128 #include "src/__support/macros/properties/architectures.h" #include "src/__support/macros/properties/compiler.h" #include "src/__support/macros/properties/cpu_features.h" diff --git a/libc/src/stdfix/abshk.h b/libc/src/stdfix/abshk.h index 13c9300caab8837..80dc73053dfb45f 100644 --- a/libc/src/stdfix/abshk.h +++ b/libc/src/stdfix/abshk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSHK_H #define LLVM_LIBC_SRC_STDFIX_ABSHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abshr.h b/libc/src/stdfix/abshr.h index 5acd0cfc4a60db0..035f9a6de222e8b 100644 --- a/libc/src/stdfix/abshr.h +++ b/libc/src/stdfix/abshr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSHR_H #define LLVM_LIBC_SRC_STDFIX_ABSHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/absk.h b/libc/src/stdfix/absk.h index 73dfcac0ac8e7f8..426415de28e6aec 100644 --- a/libc/src/stdfix/absk.h +++ b/libc/src/stdfix/absk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSK_H #define LLVM_LIBC_SRC_STDFIX_ABSK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abslk.h b/libc/src/stdfix/abslk.h index 7de116fa2279328..21e33f856bfc650 100644 --- a/libc/src/stdfix/abslk.h +++ b/libc/src/stdfix/abslk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSLK_H #define LLVM_LIBC_SRC_STDFIX_ABSLK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/abslr.h b/libc/src/stdfix/abslr.h index bf5b585bbbb6697..ebca35e58aa5102 100644 --- a/libc/src/stdfix/abslr.h +++ b/libc/src/stdfix/abslr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSLR_H #define LLVM_LIBC_SRC_STDFIX_ABSLR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/absr.h b/libc/src/stdfix/absr.h index b5ead7ce14e2a00..2744fcb5a7ecccd 100644 --- a/libc/src/stdfix/absr.h +++ b/libc/src/stdfix/absr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ABSR_H #define LLVM_LIBC_SRC_STDFIX_ABSR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundhk.h b/libc/src/stdfix/roundhk.h index 9a5c874cc030dbe..06de5cc05cdbe4e 100644 --- a/libc/src/stdfix/roundhk.h +++ b/libc/src/stdfix/roundhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDHK_H -#include 
"include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundhr.h b/libc/src/stdfix/roundhr.h index ba5a67945d6c3be..6729bf5b139973d 100644 --- a/libc/src/stdfix/roundhr.h +++ b/libc/src/stdfix/roundhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDHR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundk.h b/libc/src/stdfix/roundk.h index e9fa6d8f9c3b8c4..02fb9a8c9b1a863 100644 --- a/libc/src/stdfix/roundk.h +++ b/libc/src/stdfix/roundk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundlk.h b/libc/src/stdfix/roundlk.h index 5fa0e90e855a64a..28be9c00549498a 100644 --- a/libc/src/stdfix/roundlk.h +++ b/libc/src/stdfix/roundlk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDLK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundlr.h b/libc/src/stdfix/roundlr.h index c015292e8f3f283..be97a35a64204d3 100644 --- a/libc/src/stdfix/roundlr.h +++ b/libc/src/stdfix/roundlr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDLR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDLR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundr.h b/libc/src/stdfix/roundr.h index b5b1375c882e033..15523f8b6c9a38c 100644 --- a/libc/src/stdfix/roundr.h +++ b/libc/src/stdfix/roundr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduhk.h b/libc/src/stdfix/rounduhk.h index 85ebf2903ec7e99..d1c4a4416d7636c 100644 --- a/libc/src/stdfix/rounduhk.h +++ b/libc/src/stdfix/rounduhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduhr.h b/libc/src/stdfix/rounduhr.h index 1be0aab1f5a79db..6cecb733dd3b18c 100644 --- a/libc/src/stdfix/rounduhr.h +++ b/libc/src/stdfix/rounduhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/rounduk.h b/libc/src/stdfix/rounduk.h index 8dae89586c49014..4511d69525c5d5a 100644 --- a/libc/src/stdfix/rounduk.h +++ b/libc/src/stdfix/rounduk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUK_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundulk.h b/libc/src/stdfix/roundulk.h index 81dfd1dceb60011..8bd90beeb830c55 100644 --- a/libc/src/stdfix/roundulk.h +++ b/libc/src/stdfix/roundulk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULK_H #define 
LLVM_LIBC_SRC_STDFIX_ROUNDULK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundulr.h b/libc/src/stdfix/roundulr.h index 002fc94907c6132..65e5c27b1c8531f 100644 --- a/libc/src/stdfix/roundulr.h +++ b/libc/src/stdfix/roundulr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDULR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDULR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/roundur.h b/libc/src/stdfix/roundur.h index 72de44b1e0c4e5a..110e578da79319b 100644 --- a/libc/src/stdfix/roundur.h +++ b/libc/src/stdfix/roundur.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_ROUNDUR_H #define LLVM_LIBC_SRC_STDFIX_ROUNDUR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuhk.h b/libc/src/stdfix/sqrtuhk.h index 80000a0079696d1..b57340003fa03c3 100644 --- a/libc/src/stdfix/sqrtuhk.h +++ b/libc/src/stdfix/sqrtuhk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHK_H #define LLVM_LIBC_SRC_STDFIX_SQRTUHK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuhr.h b/libc/src/stdfix/sqrtuhr.h index fd95f0924e8d48c..6b629a29de3c887 100644 --- a/libc/src/stdfix/sqrtuhr.h +++ b/libc/src/stdfix/sqrtuhr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUHR_H #define LLVM_LIBC_SRC_STDFIX_SQRTUHR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtuk.h b/libc/src/stdfix/sqrtuk.h index 04d0adadde9ad2f..6bd7a2608716cd5 100644 --- a/libc/src/stdfix/sqrtuk.h +++ b/libc/src/stdfix/sqrtuk.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUK_H #define LLVM_LIBC_SRC_STDFIX_SQRTUK_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtulr.h b/libc/src/stdfix/sqrtulr.h index 284adaaf35bf590..d1982a6b1c05187 100644 --- a/libc/src/stdfix/sqrtulr.h +++ b/libc/src/stdfix/sqrtulr.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTULR_H #define LLVM_LIBC_SRC_STDFIX_SQRTULR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdfix/sqrtur.h b/libc/src/stdfix/sqrtur.h index df9dfe5a0bf39ee..13f7d1e5e466ec3 100644 --- a/libc/src/stdfix/sqrtur.h +++ b/libc/src/stdfix/sqrtur.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDFIX_SQRTUR_H #define LLVM_LIBC_SRC_STDFIX_SQRTUR_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/stdio/printf_core/fixed_converter.h b/libc/src/stdio/printf_core/fixed_converter.h index de69c603be6b631..c89971e20686e4c 100644 --- a/libc/src/stdio/printf_core/fixed_converter.h +++ b/libc/src/stdio/printf_core/fixed_converter.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_FIXED_CONVERTER_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/string_view.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/fixed_point/fx_rep.h" diff --git 
a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 0876116a0bac866..13fdbf243a22e8c 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H #define LLVM_LIBC_SRC_STDIO_PRINTF_CORE_PARSER_H -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/str_to_integer.h" diff --git a/libc/src/sys/epoll/epoll_pwait.h b/libc/src/sys/epoll/epoll_pwait.h index 9dcb55533009f97..16105850d6942e5 100644 --- a/libc/src/sys/epoll/epoll_pwait.h +++ b/libc/src/sys/epoll/epoll_pwait.h @@ -10,8 +10,8 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT_H // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/epoll_pwait2.h b/libc/src/sys/epoll/epoll_pwait2.h index 622ede6a0f9f9ae..f7b28d4fbc51d9a 100644 --- a/libc/src/sys/epoll/epoll_pwait2.h +++ b/libc/src/sys/epoll/epoll_pwait2.h @@ -10,9 +10,9 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_PWAIT2_H // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" -// #include "include/llvm-libc-types/struct_timespec.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_timespec.h" #include diff --git a/libc/src/sys/epoll/epoll_wait.h b/libc/src/sys/epoll/epoll_wait.h index d51c9100846ce0d..0dc487bba5bdf4e 100644 --- a/libc/src/sys/epoll/epoll_wait.h +++ b/libc/src/sys/epoll/epoll_wait.h @@ -10,7 +10,7 @@ #define LLVM_LIBC_SRC_SYS_EPOLL_EPOLL_WAIT_H // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp index ee1b4e66e984447..e0c13a7a79602f9 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp @@ -15,8 +15,8 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp index 671dede2a1058d6..a44b0c2a9f70f0a 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp @@ -15,9 +15,9 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. 
-// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" -// #include "include/llvm-libc-types/struct_timespec.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/struct_timespec.h" #include diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp index 0c43edf76454543..b643e2dd720cb61 100644 --- a/libc/src/sys/epoll/linux/epoll_wait.cpp +++ b/libc/src/sys/epoll/linux/epoll_wait.cpp @@ -14,8 +14,8 @@ #include // For syscall numbers. // TODO: Use this include once the include headers are also using quotes. -// #include "include/llvm-libc-types/sigset_t.h" -// #include "include/llvm-libc-types/struct_epoll_event.h" +// #include "llvm-libc-types/sigset_t.h" +// #include "llvm-libc-types/struct_epoll_event.h" #include diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt index 4668f0061975f85..466494f038f4e3b 100644 --- a/libc/test/UnitTest/CMakeLists.txt +++ b/libc/test/UnitTest/CMakeLists.txt @@ -26,7 +26,8 @@ function(add_unittest_framework_library name) ${TEST_LIB_SRCS} ${TEST_LIB_HDRS} ) - target_include_directories(${lib} PUBLIC ${LIBC_SOURCE_DIR}) + target_include_directories(${lib} PUBLIC + ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include) list(APPEND compile_options -fno-exceptions -fno-rtti) if(TARGET libc.src.time.clock) target_compile_definitions(${lib} PRIVATE TARGET_SUPPORTS_CLOCK) diff --git a/libc/test/UnitTest/LibcTest.cpp b/libc/test/UnitTest/LibcTest.cpp index 7b0e4fca83683b9..babd44f9b20630a 100644 --- a/libc/test/UnitTest/LibcTest.cpp +++ b/libc/test/UnitTest/LibcTest.cpp @@ -8,7 +8,7 @@ #include "LibcTest.h" -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/CPP/string.h" #include "src/__support/CPP/string_view.h" #include "src/__support/UInt128.h" diff --git a/libc/test/include/stdbit_test.cpp b/libc/test/include/stdbit_test.cpp index acb79ca0f3ff11d..84a4cde18b9f403 100644 --- a/libc/test/include/stdbit_test.cpp +++ b/libc/test/include/stdbit_test.cpp @@ -88,7 +88,7 @@ bool stdc_has_single_bit_ul(unsigned long) noexcept { return false; } bool stdc_has_single_bit_ull(unsigned long long) noexcept { return false; } } -#include "include/llvm-libc-macros/stdbit-macros.h" +#include "llvm-libc-macros/stdbit-macros.h" TEST(LlvmLibcStdbitTest, TypeGenericMacroLeadingZeros) { EXPECT_EQ(stdc_leading_zeros(static_cast(0U)), 0xAAU); diff --git a/libc/test/include/stdckdint_test.cpp b/libc/test/include/stdckdint_test.cpp index 1180a6de9efe2e9..5ac8c95f4ef26f3 100644 --- a/libc/test/include/stdckdint_test.cpp +++ b/libc/test/include/stdckdint_test.cpp @@ -8,7 +8,7 @@ #include "test/UnitTest/Test.h" -#include "include/llvm-libc-macros/stdckdint-macros.h" +#include "llvm-libc-macros/stdckdint-macros.h" TEST(LlvmLibcStdCkdIntTest, Add) { int result; diff --git a/libc/test/integration/startup/CMakeLists.txt b/libc/test/integration/startup/CMakeLists.txt index fb5d6bc787cc261..08c0d978602b80d 100644 --- a/libc/test/integration/startup/CMakeLists.txt +++ b/libc/test/integration/startup/CMakeLists.txt @@ -31,6 +31,7 @@ function(add_startup_test target_name) ${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR} + ${LIBC_SOURCE_DIR}/include ${LIBC_BUILD_DIR} ${LIBC_BUILD_DIR}/include ) diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp index 
674e2cc1ed7499f..7bbd7085fc2f45f 100644 --- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "test/IntegrationTest/test.h" diff --git a/libc/test/integration/startup/gpu/rpc_stream_test.cpp b/libc/test/integration/startup/gpu/rpc_stream_test.cpp index 09a4ae67256e3a6..9401f822904d047 100644 --- a/libc/test/integration/startup/gpu/rpc_stream_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_stream_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "src/__support/integer_to_string.h" diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp index 4032d890c53ec89..bb36b6cedb63cb6 100644 --- a/libc/test/integration/startup/gpu/rpc_test.cpp +++ b/libc/test/integration/startup/gpu/rpc_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-types/test_rpc_opcodes_t.h" +#include "llvm-libc-types/test_rpc_opcodes_t.h" #include "src/__support/GPU/utils.h" #include "src/__support/RPC/rpc_client.h" #include "test/IntegrationTest/test.h" diff --git a/libc/test/src/__support/fixed_point/fx_bits_test.cpp b/libc/test/src/__support/fixed_point/fx_bits_test.cpp index 58627816eb8d97c..5670687273d5ba4 100644 --- a/libc/test/src/__support/fixed_point/fx_bits_test.cpp +++ b/libc/test/src/__support/fixed_point/fx_bits_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "include/llvm-libc-macros/stdfix-macros.h" +#include "llvm-libc-macros/stdfix-macros.h" #include "src/__support/fixed_point/fx_bits.h" #include "src/__support/integer_literals.h" diff --git a/libc/test/src/math/differential_testing/CMakeLists.txt b/libc/test/src/math/differential_testing/CMakeLists.txt index 878f81f1d573c8f..36bfdca1a442d6a 100644 --- a/libc/test/src/math/differential_testing/CMakeLists.txt +++ b/libc/test/src/math/differential_testing/CMakeLists.txt @@ -47,6 +47,7 @@ function(add_diff_binary target_name) ${fq_target_name} PRIVATE ${LIBC_SOURCE_DIR} + ${LIBC_SOURCE_DIR}/include ) if(DIFF_COMPILE_OPTIONS) target_compile_options( diff --git a/libc/utils/LibcTableGenUtil/CMakeLists.txt b/libc/utils/LibcTableGenUtil/CMakeLists.txt index dca6a7bb8306557..60208ed790d5746 100644 --- a/libc/utils/LibcTableGenUtil/CMakeLists.txt +++ b/libc/utils/LibcTableGenUtil/CMakeLists.txt @@ -5,5 +5,5 @@ add_llvm_library( DISABLE_LLVM_LINK_LLVM_DYLIB LINK_COMPONENTS Support TableGen ) -target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR}) +target_include_directories(LibcTableGenUtil PUBLIC ${LIBC_SOURCE_DIR} ${LIBC_SOURCE_DIR}/include) target_include_directories(LibcTableGenUtil PRIVATE ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR}) diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index e2aabb08c11dac0..d74d65e89938290 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ 
-11,7 +11,7 @@
 
 #include "utils/gpu/server/llvmlibc_rpc_server.h"
 
-#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
+#include "llvm-libc-types/test_rpc_opcodes_t.h"
 
 #include <cstddef>
 #include <cstdint>

From d699d9d609a24d80809df15efe47ac539da90e93 Mon Sep 17 00:00:00 2001
From: Slava Zakharin
Date: Tue, 27 Feb 2024 15:59:25 -0800
Subject: [PATCH 019/114] [flang][runtime] Support SUM/PRODUCT/DOT_PRODUCT
 reductions for REAL(16). (#83169)

The reduction implementations rely on trivial operations that are
supported by the build compiler runtime, so they can be enabled whenever
the build compiler provides 128-bit float support. std::conj used by
DOT_PRODUCT is a template implementation in most environments, so it
should not introduce a dependency on any 128-bit float support library.

I am not going to test it in all the build environments before merging.
If it fails for someone, I will deal with it.
---
 flang/include/flang/Common/float128.h     | 16 ++++++++
 flang/include/flang/Runtime/reduction.h   | 12 ++++--
 flang/runtime/Float128Math/cabs.cpp       |  6 +--
 flang/runtime/Float128Math/math-entries.h |  9 -----
 flang/runtime/complex-reduction.c         | 47 ++++++++++++++++++-----
 flang/runtime/complex-reduction.h         | 13 +++++--
 flang/runtime/product.cpp                 |  6 ++-
 flang/runtime/sum.cpp                     |  3 +-
 8 files changed, 81 insertions(+), 31 deletions(-)

diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h
index 3443aa06437b041..6aa98df5529df22 100644
--- a/flang/include/flang/Common/float128.h
+++ b/flang/include/flang/Common/float128.h
@@ -49,4 +49,20 @@
 #endif /* (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)) && \
           !defined(_LIBCPP_VERSION) && !defined(__CUDA_ARCH__) */
 
+/* Define pure C CFloat128Type and CFloat128ComplexType. */
+#if LDBL_MANT_DIG == 113
+typedef long double CFloat128Type;
+typedef long double _Complex CFloat128ComplexType;
+#elif HAS_FLOAT128
+typedef __float128 CFloat128Type;
+/*
+ * Use mode() attribute supported by GCC and Clang.
+ * Adjust it for other compilers as needed.
+ */ +#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__) +typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType; +#else +typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType; +#endif +#endif #endif /* FORTRAN_COMMON_FLOAT128_H_ */ diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index b91fec0cd26b51c..6d62f4016937e0b 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -92,9 +92,11 @@ void RTDECL(CppSumComplex8)(std::complex &, const Descriptor &, void RTDECL(CppSumComplex10)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppSumComplex16)(std::complex &, const Descriptor &, - const char *source, int line, int dim = 0, +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +void RTDECL(CppSumComplex16)(std::complex &, + const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#endif void RTDECL(SumDim)(Descriptor &result, const Descriptor &array, int dim, const char *source, int line, const Descriptor *mask = nullptr); @@ -145,12 +147,16 @@ void RTDECL(CppProductComplex4)(std::complex &, const Descriptor &, void RTDECL(CppProductComplex8)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#if LDBL_MANT_DIG == 64 void RTDECL(CppProductComplex10)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); -void RTDECL(CppProductComplex16)(std::complex &, +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +void RTDECL(CppProductComplex16)(std::complex &, const Descriptor &, const char *source, int line, int dim = 0, const Descriptor *mask = nullptr); +#endif void RTDECL(ProductDim)(Descriptor &result, const Descriptor &array, int dim, const char *source, int line, const Descriptor *mask = nullptr); diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp index 63f2bdf8e177ae0..2867c8a4578a80b 100644 --- a/flang/runtime/Float128Math/cabs.cpp +++ b/flang/runtime/Float128Math/cabs.cpp @@ -12,10 +12,8 @@ namespace Fortran::runtime { extern "C" { #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 -// FIXME: the argument should be CppTypeFor, -// and it should be translated into the underlying library's -// corresponding complex128 type. -CppTypeFor RTDEF(CAbsF128)(ComplexF128 x) { +// NOTE: Flang calls the runtime APIs using C _Complex ABI +CppTypeFor RTDEF(CAbsF128)(CFloat128ComplexType x) { return CAbs::invoke(x); } #endif diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index fe1525468edcafc..141648d2fb2c54f 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -91,15 +91,6 @@ DEFINE_FALLBACK(Y0) DEFINE_FALLBACK(Y1) DEFINE_FALLBACK(Yn) -// Define ComplexF128 type that is compatible with -// the type of results/arguments of libquadmath. -// TODO: this may need more work for other libraries/compilers. -#if !defined(_ARCH_PPC) || defined(__LONG_DOUBLE_IEEE128__) -typedef _Complex float __attribute__((mode(TC))) ComplexF128; -#else -typedef _Complex float __attribute__((mode(KC))) ComplexF128; -#endif - #if HAS_LIBM // Define wrapper callers for libm. 
#include diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index d77e1c0a5500691..06e4f15c7fa9b5f 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -19,6 +19,11 @@ struct CppComplexDouble { struct CppComplexLongDouble { long double r, i; }; +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +struct CppComplexFloat128 { + CFloat128Type r, i; +}; +#endif /* Not all environments define CMPLXF, CMPLX, CMPLXL. */ @@ -70,6 +75,27 @@ static long_double_Complex_t CMPLXL(long double r, long double i) { #endif #endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +/* + * GCC 7.4.0 (currently minimum GCC version for llvm builds) + * supports __builtin_complex. For Clang, require >=12.0. + * Otherwise, rely on the memory layout compatibility. + */ +#if (defined(__clang_major__) && (__clang_major__ >= 12)) || defined(__GNUC__) +#define CMPLXF128 __builtin_complex +#else +static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) { + union { + struct CppComplexFloat128 x; + CFloat128ComplexType result; + } u; + u.x.r = r; + u.x.i = i; + return u.result; +} +#endif +#endif + /* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments * and converts the members of its C++ complex result to C _Complex. */ @@ -93,9 +119,10 @@ ADAPT_REDUCTION(SumComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(SumComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(SumComplex16, long_double_Complex_t, CppComplexLongDouble, - CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(SumComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif /* PRODUCT() */ @@ -106,9 +133,10 @@ ADAPT_REDUCTION(ProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(ProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(ProductComplex16, long_double_Complex_t, CppComplexLongDouble, - CMPLXL, REDUCTION_ARGS, REDUCTION_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(ProductComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, REDUCTION_ARGS, REDUCTION_ARG_NAMES) #endif /* DOT_PRODUCT() */ @@ -119,7 +147,8 @@ ADAPT_REDUCTION(DotProductComplex8, double_Complex_t, CppComplexDouble, CMPLX, #if LDBL_MANT_DIG == 64 ADAPT_REDUCTION(DotProductComplex10, long_double_Complex_t, CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) -#elif LDBL_MANT_DIG == 113 -ADAPT_REDUCTION(DotProductComplex16, long_double_Complex_t, - CppComplexLongDouble, CMPLXL, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +ADAPT_REDUCTION(DotProductComplex16, CFloat128ComplexType, CppComplexFloat128, + CMPLXF128, DOT_PRODUCT_ARGS, DOT_PRODUCT_ARG_NAMES) #endif diff --git a/flang/runtime/complex-reduction.h b/flang/runtime/complex-reduction.h index 5c4f1f5126e393b..1d37b235d5194b5 100644 --- a/flang/runtime/complex-reduction.h +++ b/flang/runtime/complex-reduction.h @@ -15,6 +15,7 @@ #ifndef FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ #define FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ +#include "flang/Common/float128.h" #include "flang/Runtime/entry-names.h" #include @@ -40,14 +41,18 @@ float_Complex_t 
RTNAME(SumComplex3)(REDUCTION_ARGS); float_Complex_t RTNAME(SumComplex4)(REDUCTION_ARGS); double_Complex_t RTNAME(SumComplex8)(REDUCTION_ARGS); long_double_Complex_t RTNAME(SumComplex10)(REDUCTION_ARGS); -long_double_Complex_t RTNAME(SumComplex16)(REDUCTION_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(SumComplex16)(REDUCTION_ARGS); +#endif float_Complex_t RTNAME(ProductComplex2)(REDUCTION_ARGS); float_Complex_t RTNAME(ProductComplex3)(REDUCTION_ARGS); float_Complex_t RTNAME(ProductComplex4)(REDUCTION_ARGS); double_Complex_t RTNAME(ProductComplex8)(REDUCTION_ARGS); long_double_Complex_t RTNAME(ProductComplex10)(REDUCTION_ARGS); -long_double_Complex_t RTNAME(ProductComplex16)(REDUCTION_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(ProductComplex16)(REDUCTION_ARGS); +#endif #define DOT_PRODUCT_ARGS \ const struct CppDescriptor *x, const struct CppDescriptor *y, \ @@ -60,6 +65,8 @@ float_Complex_t RTNAME(DotProductComplex3)(DOT_PRODUCT_ARGS); float_Complex_t RTNAME(DotProductComplex4)(DOT_PRODUCT_ARGS); double_Complex_t RTNAME(DotProductComplex8)(DOT_PRODUCT_ARGS); long_double_Complex_t RTNAME(DotProductComplex10)(DOT_PRODUCT_ARGS); -long_double_Complex_t RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS); +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +CFloat128ComplexType RTNAME(DotProductComplex16)(DOT_PRODUCT_ARGS); +#endif #endif // FORTRAN_RUNTIME_COMPLEX_REDUCTION_H_ diff --git a/flang/runtime/product.cpp b/flang/runtime/product.cpp index a516bc51a959b77..4c3b8c33a12e0f1 100644 --- a/flang/runtime/product.cpp +++ b/flang/runtime/product.cpp @@ -123,7 +123,8 @@ CppTypeFor RTDEF(ProductReal10)(const Descriptor &x, NonComplexProductAccumulator>{x}, "PRODUCT"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 CppTypeFor RTDEF(ProductReal16)(const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { return GetTotalReduction(x, source, line, dim, mask, @@ -154,7 +155,8 @@ void RTDEF(CppProductComplex10)(CppTypeFor &result, mask, ComplexProductAccumulator>{x}, "PRODUCT"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppProductComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { diff --git a/flang/runtime/sum.cpp b/flang/runtime/sum.cpp index 048399737c85015..d2495e3e956fe6b 100644 --- a/flang/runtime/sum.cpp +++ b/flang/runtime/sum.cpp @@ -175,7 +175,8 @@ void RTDEF(CppSumComplex10)(CppTypeFor &result, result = GetTotalReduction( x, source, line, dim, mask, ComplexSumAccumulator{x}, "SUM"); } -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 void RTDEF(CppSumComplex16)(CppTypeFor &result, const Descriptor &x, const char *source, int line, int dim, const Descriptor *mask) { From 062cfada643c1aa48a1bb81894e2920d390fe8cf Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 27 Feb 2024 16:32:53 -0800 Subject: [PATCH 020/114] [builtins] Disable COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY by default (#83201) Most of GCC's Linux targets have a link spec `%{!static|static-pie:--eh-frame-hdr}` that doesn't pass --eh-frame-hdr for `-static` links. `-static` links are supposed to utilize `__register_frame_info` (called by `crtbeginT.o`, not by crtbegin.o or crtbeginS.o) as a replacement. 
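As a concrete reference, a minimal sketch of the registration interface
involved; the declarations mirror the stubs in the ctor_dtor.c test below
and the libgcc convention, so treat the exact prototypes as an assumption
rather than part of this change:

  /* Assumed prototypes, per the libgcc convention: a static-link startup
     object such as crtbeginT.o registers the .eh_frame section at program
     start, instead of relying on the PT_GNU_EH_FRAME header that
     --eh-frame-hdr would produce. */
  struct object; /* opaque unwinder bookkeeping record */
  void __register_frame_info(const void *eh_frame, struct object *obj);
  void __deregister_frame_info(const void *eh_frame);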
compiler-rt crtbegin (not used with GCC) has some eh_frame code, which is
not utilized because the Clang driver unconditionally passes
--eh-frame-hdr for Linux targets, even for -static. In addition, LLVM
libunwind implements `__register_frame_info` as an empty stub.
Furthermore, in a non-static link, the `__register_frame_info` references
can cause an undesired weak dynamic symbol.

For now, just disable the config by default.
---
 compiler-rt/lib/builtins/CMakeLists.txt    |  2 +-
 compiler-rt/test/builtins/Unit/ctor_dtor.c | 10 ----------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 28ded8766f2533a..83f7697a4a2b429 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -916,7 +916,7 @@ cmake_dependent_option(COMPILER_RT_BUILD_CRT "Build crtbegin.o/crtend.o" ON "COM
 if (COMPILER_RT_BUILD_CRT)
   add_compiler_rt_component(crt)
 
-  option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" ON)
+  option(COMPILER_RT_CRT_USE_EH_FRAME_REGISTRY "Use eh_frame in crtbegin.o/crtend.o" OFF)
 
   include(CheckSectionExists)
   check_section_exists(".init_array" COMPILER_RT_HAS_INITFINI_ARRAY
diff --git a/compiler-rt/test/builtins/Unit/ctor_dtor.c b/compiler-rt/test/builtins/Unit/ctor_dtor.c
index 47560722a9f7505..3d5f895a0a1cd16 100644
--- a/compiler-rt/test/builtins/Unit/ctor_dtor.c
+++ b/compiler-rt/test/builtins/Unit/ctor_dtor.c
@@ -9,23 +9,13 @@
 
 // Ensure the various startup functions are called in the proper order.
 
-// CHECK: __register_frame_info()
 /// ctor() is here if ld.so/libc supports DT_INIT/DT_FINI
 // CHECK: main()
 /// dtor() is here if ld.so/libc supports DT_INIT/DT_FINI
-// CHECK: __deregister_frame_info()
 
 struct object;
 static int counter;
 
-void __register_frame_info(const void *fi, struct object *obj) {
-  printf("__register_frame_info()\n");
-}
-
-void __deregister_frame_info(const void *fi) {
-  printf("__deregister_frame_info()\n");
-}
-
 void __attribute__((constructor)) ctor() {
   printf("ctor()\n");
   ++counter;

From 8506a63bf7cbe593c0707f995fbd0b8f820d0d62 Mon Sep 17 00:00:00 2001
From: Heejin Ahn
Date: Wed, 28 Feb 2024 01:02:39 +0000
Subject: [PATCH 021/114] Revert "[WebAssembly] Disable multivalue emission
 temporarily (#82714)"

This reverts commit 6e6bf9f81756ba6655b4eea8dc45469a47f89b39.

It turned out the multivalue feature has active outside users, and
disabling it could disrupt them, so I'd like to investigate possible
workarounds further before disabling it again.
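For context, a hypothetical illustration (not part of this revert) of the
kind of return that multivalue emission affects: with the feature enabled,
the aggregate below can be lowered to a wasm (i32, i64) result tuple;
without it, the return is demoted to an sret-style out-parameter, as the
comment in WebAssemblyMachineFunctionInfo.cpp notes.

  /* Hypothetical example only: with -mattr=+multivalue this pair can be
     returned as two wasm result values; otherwise it comes back through
     a hidden sret pointer. */
  struct pair {
    int lo;
    long long hi;
  };

  struct pair make_pair(void) {
    struct pair p = {1, 2};
    return p;
  }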
--- .../WebAssembly/WebAssemblyISelLowering.cpp | 7 ++--- .../WebAssemblyMachineFunctionInfo.cpp | 5 +--- .../WebAssemblyRuntimeLibcallSignatures.cpp | 26 +++++++++---------- .../WebAssembly/WebAssemblyTargetMachine.cpp | 9 ------- .../lower-em-ehsjlj-multi-return.ll | 4 +-- .../multivalue-dont-move-def-past-use.mir | 2 +- .../WebAssembly/multivalue-stackify.ll | 2 +- llvm/test/CodeGen/WebAssembly/multivalue.ll | 10 +++---- .../CodeGen/WebAssembly/multivalue_libcall.ll | 2 +- 9 files changed, 24 insertions(+), 43 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 36f067956e63a5a..7c47790d1e3515b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -43,8 +43,6 @@ using namespace llvm; #define DEBUG_TYPE "wasm-lower" -extern cl::opt WasmEmitMultiValue; - WebAssemblyTargetLowering::WebAssemblyTargetLowering( const TargetMachine &TM, const WebAssemblySubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -1290,7 +1288,7 @@ bool WebAssemblyTargetLowering::CanLowerReturn( const SmallVectorImpl &Outs, LLVMContext & /*Context*/) const { // WebAssembly can only handle returning tuples with multivalue enabled - return (Subtarget->hasMultivalue() && WasmEmitMultiValue) || Outs.size() <= 1; + return Subtarget->hasMultivalue() || Outs.size() <= 1; } SDValue WebAssemblyTargetLowering::LowerReturn( @@ -1298,8 +1296,7 @@ SDValue WebAssemblyTargetLowering::LowerReturn( const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { - assert(((Subtarget->hasMultivalue() && WasmEmitMultiValue) || - Outs.size() <= 1) && + assert((Subtarget->hasMultivalue() || Outs.size() <= 1) && "MVP WebAssembly can only return up to one value"); if (!callingConvSupported(CallConv)) fail(DL, DAG, "WebAssembly doesn't support non-C calling conventions"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index b969b8370a3e5ee..1e959111a4dbcb5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -22,8 +22,6 @@ #include "llvm/Target/TargetMachine.h" using namespace llvm; -extern cl::opt WasmEmitMultiValue; - WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor. MachineFunctionInfo *WebAssemblyFunctionInfo::clone( @@ -73,8 +71,7 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - (!TM.getSubtarget(ContextFunc).hasMultivalue() || - !WasmEmitMultiValue)) { + !TM.getSubtarget(ContextFunc).hasMultivalue()) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). 
So replace multiple return diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index 2a84c90c89602e8..3e2e029695ab6fa 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -24,8 +24,6 @@ using namespace llvm; -extern cl::opt WasmEmitMultiValue; - namespace { enum RuntimeLibcallSignature { @@ -696,7 +694,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_func_f32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -705,7 +703,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F32); break; case i64_i64_func_f64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -714,7 +712,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::F64); break; case i16_i16_func_i16_i16: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -724,7 +722,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i32_i32_func_i32_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I32); Rets.push_back(wasm::ValType::I32); } else { @@ -734,7 +732,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -744,7 +742,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -756,7 +754,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_iPTR: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -769,7 +767,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(PtrTy); break; case i64_i64_i64_i64_func_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); @@ -783,7 +781,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -853,7 
+851,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i64_i64_i64_i64_i64_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -867,7 +865,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I64); break; case i64_i64_func_i32: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { @@ -876,7 +874,7 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget, Params.push_back(wasm::ValType::I32); break; case i64_i64_func_i64: - if (Subtarget.hasMultivalue() && WasmEmitMultiValue) { + if (Subtarget.hasMultivalue()) { Rets.push_back(wasm::ValType::I64); Rets.push_back(wasm::ValType::I64); } else { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp index d088c7d925ddf00..4d4cae110148d6c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp @@ -54,15 +54,6 @@ static cl::opt WasmDisableFixIrreducibleControlFlowPass( " irreducible control flow optimization pass"), cl::init(false)); -// A temporary option to control emission of multivalue until multivalue -// implementation is stable enough. We currently don't emit multivalue by -// default even if the feature section allows it. -// TODO Stabilize multivalue and delete this option -cl::opt - WasmEmitMultiValue("wasm-emit-multivalue", cl::Hidden, - cl::desc("WebAssembly: Emit multivalue in the backend"), - cl::init(false)); - extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() { // Register the target. RegisterTargetMachine X( diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll index daf46c6eef02523..4f33439db770dc0 100644 --- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll +++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll @@ -1,5 +1,5 @@ -; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH -; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 -wasm-emit-multivalue | FileCheck %s --check-prefix=SJLJ +; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH +; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ ; Currently multivalue returning functions are not supported in Emscripten EH / ; SjLj. Make sure they error out. 
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir index 4fadbd5f07e6dff..4b4661b144667f9 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir +++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s --- | target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll index f4f93ac2f30ce39..52a8c686824d338 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Test functions have been generated by multivalue-stackify.py. -; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s ; Test that the multivalue stackification works diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll index 846691e5ff0cdaa..675009c8f3e5481 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll @@ -1,8 +1,7 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS -; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS +; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ ; Test that the multivalue calls, returns, function types, and block ; types work as expected. 
@@ -20,7 +19,6 @@ declare void @use_i64(i64) ; CHECK-NEXT: i32.const 42{{$}} ; CHECK-NEXT: i64.const 42{{$}} ; CHECK-NEXT: end_function{{$}} -; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64) define %pair @pair_const() { ret %pair { i32 42, i64 42 } }
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll index 7bf37b59353ad64..47c5ae7b457dde8 100644 --- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll +++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE +; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE ; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE ; Test libcall signatures when multivalue is enabled and disabled
From f20ea05f3b2218a7103612e5e367398b0b27bb27 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 17:02:27 -0800 Subject: [PATCH 022/114] [flang][runtime] Fixed aarch64 buildbots after #83169.
--- flang/include/flang/Common/float128.h | 2 ++ flang/runtime/complex-reduction.c | 2 ++ 2 files changed, 4 insertions(+)
diff --git a/llvm/include... diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h index 6aa98df5529df22..61b3a77b867a93b 100644 --- a/flang/include/flang/Common/float128.h +++ b/flang/include/flang/Common/float128.h @@ -20,6 +20,8 @@ #ifndef FORTRAN_COMMON_FLOAT128_H_ #define FORTRAN_COMMON_FLOAT128_H_ +#include + #ifdef __cplusplus /* * libc++ does not fully support __float128 right now, e.g. diff --git a/flang/runtime/complex-reduction.c b/flang/runtime/complex-reduction.c index 06e4f15c7fa9b5f..72c31ce08b875a7 100644 --- a/flang/runtime/complex-reduction.c +++ b/flang/runtime/complex-reduction.c @@ -76,6 +76,7 @@ static long_double_Complex_t CMPLXL(long double r, long double i) { #endif #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 +#ifndef CMPLXF128 /* * GCC 7.4.0 (currently minimum GCC version for llvm builds) * supports __builtin_complex. For Clang, require >=12.0. @@ -95,6 +96,7 @@ static CFloat128ComplexType CMPLXF128(CFloat128Type r, CFloat128Type i) { } #endif #endif +#endif /* RTNAME(SumComplex4) calls RTNAME(CppSumComplex4) with the same arguments * and converts the members of its C++ complex result to C _Complex.
From bcbce807d76a30388b366d14051c5f80e9724dab Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Tue, 27 Feb 2024 20:19:07 -0500 Subject: [PATCH 023/114] Revert "[HIP] fix host min/max in header (#82956)"
This reverts commit 55783bd0f9cfc30aa93c718919dab5419d86a2c6 due to regressions in hipCUB: hipCUB/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp:142:33: error: call to 'min' is ambiguous https://github.com/ROCm/hipCUB/blob/develop/hipcub/include/hipcub/backend/rocprim/device/device_spmv.hpp#L142 The ambiguity seems to be due to the missing min(int, unsigned int) overload. Previously, there was only min(int, int). After the change, there are both min(int, int) and min(unsigned int, unsigned int), so the mixed-type call is ambiguous.
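For illustration, a minimal self-contained C++ sketch of that ambiguity; the overload set below is a simplification for demonstration, not the actual HIP header:

  // With only min(int, int) visible, min(i, u) compiles by converting u to
  // int. Once min(unsigned int, unsigned int) is also visible, neither
  // overload is a better match for (int, unsigned int) -- each needs an
  // integral conversion on exactly one argument -- so the call is ambiguous.
  inline int min(int a, int b) { return a < b ? a : b; }
  inline unsigned int min(unsigned int a, unsigned int b) {
    return a < b ? a : b;
  }

  int main() {
    int i = 1;
    unsigned int u = 2u;
    return min(i, u); // error: call to 'min' is ambiguous
  }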
--- clang/lib/Headers/__clang_hip_math.h | 72 +++------------------------- 1 file changed, 6 insertions(+), 66 deletions(-) diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 34d1de0a0600557..11e1e7d032586f8 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -1306,75 +1306,15 @@ float min(float __x, float __y) { return __builtin_fminf(__x, __y); } __DEVICE__ double min(double __x, double __y) { return __builtin_fmin(__x, __y); } -// Define host min/max functions. -#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && \ - !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) - -#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") -#pragma push_macro("DEFINE_MIN_MAX_FUNCTIONS") -#define DEFINE_MIN_MAX_FUNCTIONS(ret_type, type1, type2) \ - inline ret_type min(const type1 __a, const type2 __b) { \ - return (__a < __b) ? __a : __b; \ - } \ - inline ret_type max(const type1 __a, const type2 __b) { \ - return (__a > __b) ? __a : __b; \ - } - -// Define min and max functions for same type comparisons -DEFINE_MIN_MAX_FUNCTIONS(int, int, int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, unsigned int) -DEFINE_MIN_MAX_FUNCTIONS(long, long, long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, unsigned long) -DEFINE_MIN_MAX_FUNCTIONS(long long, long long, long long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, - unsigned long long) - -// The host min/max functions below accept mixed signed/unsigned integer -// parameters and perform unsigned comparisons, which may produce unexpected -// results if a signed integer was passed unintentionally. To avoid this -// happening silently, these overloaded functions are not defined by default. -// However, for compatibility with CUDA, they will be defined if users define -// __HIP_DEFINE_MIXED_HOST_MIN_MAX__. -#ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, int, unsigned int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned int, unsigned int, int) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, long, unsigned long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long, unsigned long, long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, long long, unsigned long long) -DEFINE_MIN_MAX_FUNCTIONS(unsigned long long, unsigned long long, long long) -#endif // ifdef __HIP_DEFINE_MIXED_HOST_MIN_MAX__ - -// Floating-point comparisons using built-in functions -inline float min(float const __a, float const __b) { - return __builtin_fminf(__a, __b); -} -inline double min(double const __a, double const __b) { - return __builtin_fmin(__a, __b); -} -inline double min(float const __a, double const __b) { - return __builtin_fmin(__a, __b); -} -inline double min(double const __a, float const __b) { - return __builtin_fmin(__a, __b); +#if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) +__host__ inline static int min(int __arg1, int __arg2) { + return __arg1 < __arg2 ? __arg1 : __arg2; } -inline float max(float const __a, float const __b) { - return __builtin_fmaxf(__a, __b); -} -inline double max(double const __a, double const __b) { - return __builtin_fmax(__a, __b); -} -inline double max(float const __a, double const __b) { - return __builtin_fmax(__a, __b); +__host__ inline static int max(int __arg1, int __arg2) { + return __arg1 > __arg2 ? 
__arg1 : __arg2; } -inline double max(double const __a, float const __b) { - return __builtin_fmax(__a, __b); -} - -#pragma pop_macro("DEFINE_MIN_MAX_FUNCTIONS") - -#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) && - // !defined(__HIP_NO_HOST_MIN_MAX_IN_GLOBAL_NAMESPACE__) +#endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) #endif #pragma pop_macro("__DEVICE__") From dba2dd2c487d7bed7ad3b76a67fdfce464f0edbf Mon Sep 17 00:00:00 2001 From: Alexander Yermolovich <43973793+ayermolo@users.noreply.github.com> Date: Tue, 27 Feb 2024 17:26:06 -0800 Subject: [PATCH 024/114] Revert "[CLANG][DWARF] Do not emit -ggnu-pubnames for split dwarf version 5." (#83214) Reverts llvm/llvm-project#82840 --- clang/lib/Driver/ToolChains/Clang.cpp | 7 +++---- clang/test/Driver/split-debug.c | 5 ----- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index dbfc729bba24c76..6e1b7e8657d0dc9 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4479,10 +4479,9 @@ renderDebugOptions(const ToolChain &TC, const Driver &D, const llvm::Triple &T, options::OPT_gpubnames, options::OPT_gno_pubnames); if (DwarfFission != DwarfFissionKind::None || (PubnamesArg && checkDebugInfoOption(PubnamesArg, Args, D, TC))) - if (DebuggerTuning != llvm::DebuggerKind::LLDB && - (!PubnamesArg || - (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && - !PubnamesArg->getOption().matches(options::OPT_gno_pubnames)))) + if (!PubnamesArg || + (!PubnamesArg->getOption().matches(options::OPT_gno_gnu_pubnames) && + !PubnamesArg->getOption().matches(options::OPT_gno_pubnames))) CmdArgs.push_back(PubnamesArg && PubnamesArg->getOption().matches( options::OPT_gpubnames) ? "-gpubnames" diff --git a/clang/test/Driver/split-debug.c b/clang/test/Driver/split-debug.c index a2a3dc023545034..968f33b4cc035c5 100644 --- a/clang/test/Driver/split-debug.c +++ b/clang/test/Driver/split-debug.c @@ -124,8 +124,3 @@ // G1_NOSPLIT: "-debug-info-kind=line-tables-only" // G1_NOSPLIT-NOT: "-split-dwarf-file" // G1_NOSPLIT-NOT: "-split-dwarf-output" - -/// Do not generate -ggnu-pubnames for -glldb -// RUN: %clang -### -c -target x86_64 -gsplit-dwarf -g -glldb %s 2>&1 | FileCheck %s --check-prefixes=GLLDBSPLIT - -// GLLDBSPLIT-NOT: "-ggnu-pubnames" From e9e7aeadaf0ce9d66ff352856fd2d1005b0f7d74 Mon Sep 17 00:00:00 2001 From: Daniel Hoekwater Date: Wed, 28 Feb 2024 01:34:48 +0000 Subject: [PATCH 025/114] [Driver] Allow -fbasic-block-address-map for AArch64 ELF (#82662) Emitting the basic block address map with `-fbasic-block-sections=labels` is allowed for AArch64 ELF since 7eaf94fefa1250fc8a46982cea8ce99abacae11f. Allow doing so with `-fbasic-block-address-map`. 
--- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- clang/test/Driver/basic-block-address-map.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6e1b7e8657d0dc9..66c3a237c12117a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5958,7 +5958,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Arg *A = Args.getLastArg(options::OPT_fbasic_block_address_map, options::OPT_fno_basic_block_address_map)) { - if (Triple.isX86() && Triple.isOSBinFormatELF()) { + if ((Triple.isX86() || Triple.isAArch64()) && Triple.isOSBinFormatELF()) { if (A->getOption().matches(options::OPT_fbasic_block_address_map)) A->render(Args, CmdArgs); } else { diff --git a/clang/test/Driver/basic-block-address-map.c b/clang/test/Driver/basic-block-address-map.c index 022f972b412d6b5..12393e8ebfd54ea 100644 --- a/clang/test/Driver/basic-block-address-map.c +++ b/clang/test/Driver/basic-block-address-map.c @@ -1,8 +1,9 @@ -// RUN: %clang -### -target x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s +// RUN: %clang -### --target=x86_64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s +// RUN: %clang -### --target=aarch64 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-PRESENT %s // CHECK-PRESENT: -fbasic-block-address-map -// RUN: %clang -### -target x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT +// RUN: %clang -### --target=x86_64 -fno-basic-block-address-map %s -S 2>&1 | FileCheck %s --check-prefix=CHECK-ABSENT // CHECK-ABSENT-NOT: -fbasic-block-address-map -// RUN: not %clang -c -target x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s +// RUN: not %clang -c --target=x86_64-apple-darwin10 -fbasic-block-address-map %s -S 2>&1 | FileCheck -check-prefix=CHECK-TRIPLE %s // CHECK-TRIPLE: error: unsupported option '-fbasic-block-address-map' for target
From 50136ca11f62050b34876a920fcd87d2aefccfdb Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Tue, 27 Feb 2024 18:01:37 -0800 Subject: [PATCH 026/114] [DirectX][NFC] Rename ShaderFlag to SHADER_FEATURE_FLAG. (#82700)
This is preparation for adding ShaderFlag in DXIL.
For #57925 --- llvm/include/llvm/BinaryFormat/DXContainer.h | 2 +- .../BinaryFormat/DXContainerConstants.def | 74 +++++++++---------- llvm/include/llvm/Object/DXContainer.h | 8 +- .../include/llvm/ObjectYAML/DXContainerYAML.h | 14 ++-- llvm/lib/Object/DXContainer.cpp | 8 +- llvm/lib/ObjectYAML/DXContainerYAML.cpp | 14 ++-- llvm/lib/Target/DirectX/DXILShaderFlags.cpp | 2 +- llvm/lib/Target/DirectX/DXILShaderFlags.h | 6 +- llvm/tools/obj2yaml/dxcontainer2yaml.cpp | 4 +- 9 files changed, 67 insertions(+), 65 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/DXContainer.h b/llvm/include/llvm/BinaryFormat/DXContainer.h index c3dcd568216b71f..a28e19edb4c6a62 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainer.h +++ b/llvm/include/llvm/BinaryFormat/DXContainer.h @@ -141,7 +141,7 @@ enum class PartType { #include "DXContainerConstants.def" }; -#define SHADER_FLAG(Num, Val, Str) Val = 1ull << Num, +#define SHADER_FEATURE_FLAG(Num, Val, Str) Val = 1ull << Num, enum class FeatureFlags : uint64_t { #include "DXContainerConstants.def" }; diff --git a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def index 87dd0a5cb6ba709..80ed86bc3a499e9 100644 --- a/llvm/include/llvm/BinaryFormat/DXContainerConstants.def +++ b/llvm/include/llvm/BinaryFormat/DXContainerConstants.def @@ -11,43 +11,43 @@ CONTAINER_PART(PSG1) #undef CONTAINER_PART #endif -#ifdef SHADER_FLAG - -SHADER_FLAG(0, Doubles, "Double-precision floating point") -SHADER_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers") -SHADER_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage") -SHADER_FLAG(3, Max64UAVs, "64 UAV slots") -SHADER_FLAG(4, MinimumPrecision, "Minimum-precision data types") -SHADER_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1") -SHADER_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1") -SHADER_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9") -SHADER_FLAG(8, TiledResources, "Tiled resources") -SHADER_FLAG(9, StencilRef, "PS Output Stencil Ref") -SHADER_FLAG(10, InnerCoverage, "PS Inner Coverage") -SHADER_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats") -SHADER_FLAG(12, ROVs, "Raster Ordered UAVs") -SHADER_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer") -SHADER_FLAG(14, WaveOps, "Wave level operations") -SHADER_FLAG(15, Int64Ops, "64-Bit integer") -SHADER_FLAG(16, ViewID, "View Instancing") -SHADER_FLAG(17, Barycentrics, "Barycentrics") -SHADER_FLAG(18, NativeLowPrecision, "Use native low precision") -SHADER_FLAG(19, ShadingRate, "Shading Rate") -SHADER_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features") -SHADER_FLAG(21, SamplerFeedback, "Sampler feedback") -SHADER_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources") -SHADER_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared") -SHADER_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders") -SHADER_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing") -SHADER_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing") -SHADER_FLAG(27, RESERVED, "") -SHADER_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources") -SHADER_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops") -SHADER_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures") - 
-SHADER_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)") - -#undef SHADER_FLAG +#ifdef SHADER_FEATURE_FLAG + +SHADER_FEATURE_FLAG(0, Doubles, "Double-precision floating point") +SHADER_FEATURE_FLAG(1, ComputeShadersPlusRawAndStructuredBuffers, "Raw and Structured buffers") +SHADER_FEATURE_FLAG(2, UAVsAtEveryStage, "UAVs at every shader stage") +SHADER_FEATURE_FLAG(3, Max64UAVs, "64 UAV slots") +SHADER_FEATURE_FLAG(4, MinimumPrecision, "Minimum-precision data types") +SHADER_FEATURE_FLAG(5, DX11_1_DoubleExtensions, "Double-precision extensions for 11.1") +SHADER_FEATURE_FLAG(6, DX11_1_ShaderExtensions, "Shader extensions for 11.1") +SHADER_FEATURE_FLAG(7, LEVEL9ComparisonFiltering, "Comparison filtering for feature level 9") +SHADER_FEATURE_FLAG(8, TiledResources, "Tiled resources") +SHADER_FEATURE_FLAG(9, StencilRef, "PS Output Stencil Ref") +SHADER_FEATURE_FLAG(10, InnerCoverage, "PS Inner Coverage") +SHADER_FEATURE_FLAG(11, TypedUAVLoadAdditionalFormats, "Typed UAV Load Additional Formats") +SHADER_FEATURE_FLAG(12, ROVs, "Raster Ordered UAVs") +SHADER_FEATURE_FLAG(13, ViewportAndRTArrayIndexFromAnyShaderFeedingRasterizer, "SV_RenderTargetArrayIndex or SV_ViewportArrayIndex from any shader feeding rasterizer") +SHADER_FEATURE_FLAG(14, WaveOps, "Wave level operations") +SHADER_FEATURE_FLAG(15, Int64Ops, "64-Bit integer") +SHADER_FEATURE_FLAG(16, ViewID, "View Instancing") +SHADER_FEATURE_FLAG(17, Barycentrics, "Barycentrics") +SHADER_FEATURE_FLAG(18, NativeLowPrecision, "Use native low precision") +SHADER_FEATURE_FLAG(19, ShadingRate, "Shading Rate") +SHADER_FEATURE_FLAG(20, Raytracing_Tier_1_1, "Raytracing tier 1.1 features") +SHADER_FEATURE_FLAG(21, SamplerFeedback, "Sampler feedback") +SHADER_FEATURE_FLAG(22, AtomicInt64OnTypedResource, "64-bit Atomics on Typed Resources") +SHADER_FEATURE_FLAG(23, AtomicInt64OnGroupShared, "64-bit Atomics on Group Shared") +SHADER_FEATURE_FLAG(24, DerivativesInMeshAndAmpShaders, "Derivatives in mesh and amplification shaders") +SHADER_FEATURE_FLAG(25, ResourceDescriptorHeapIndexing, "Resource descriptor heap indexing") +SHADER_FEATURE_FLAG(26, SamplerDescriptorHeapIndexing, "Sampler descriptor heap indexing") +SHADER_FEATURE_FLAG(27, RESERVED, "") +SHADER_FEATURE_FLAG(28, AtomicInt64OnHeapResource, "64-bit Atomics on Heap Resources") +SHADER_FEATURE_FLAG(29, AdvancedTextureOps, "Advanced Texture Ops") +SHADER_FEATURE_FLAG(30, WriteableMSAATextures, "Writeable MSAA Textures") + +SHADER_FEATURE_FLAG(31, NextUnusedBit, "Next reserved shader flag bit (not a flag)") + +#undef SHADER_FEATURE_FLAG #endif #ifdef SEMANTIC_KIND diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index a7f18c799698032..b6e3d321da2461a 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -276,7 +276,7 @@ class DXContainer { dxbc::Header Header; SmallVector PartOffsets; std::optional DXIL; - std::optional ShaderFlags; + std::optional ShaderFeatureFlags; std::optional Hash; std::optional PSVInfo; DirectX::Signature InputSignature; @@ -286,7 +286,7 @@ class DXContainer { Error parseHeader(); Error parsePartOffsets(); Error parseDXILHeader(StringRef Part); - Error parseShaderFlags(StringRef Part); + Error parseShaderFeatureFlags(StringRef Part); Error parseHash(StringRef Part); Error parsePSVInfo(StringRef Part); Error parseSignature(StringRef Part, DirectX::Signature &Array); @@ -368,7 +368,9 @@ class DXContainer { const std::optional &getDXIL() const { return DXIL; } - 
std::optional getShaderFlags() const { return ShaderFlags; } + std::optional getShaderFeatureFlags() const { + return ShaderFeatureFlags; + } std::optional getShaderHash() const { return Hash; } diff --git a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h index 66a6ac70bbea10e..497f82bbd0f32a5 100644 --- a/llvm/include/llvm/ObjectYAML/DXContainerYAML.h +++ b/llvm/include/llvm/ObjectYAML/DXContainerYAML.h @@ -56,10 +56,10 @@ struct DXILProgram { std::optional> DXIL; }; -#define SHADER_FLAG(Num, Val, Str) bool Val = false; -struct ShaderFlags { - ShaderFlags() = default; - ShaderFlags(uint64_t FlagData); +#define SHADER_FEATURE_FLAG(Num, Val, Str) bool Val = false; +struct ShaderFeatureFlags { + ShaderFeatureFlags() = default; + ShaderFeatureFlags(uint64_t FlagData); uint64_t getEncodedFlags(); #include "llvm/BinaryFormat/DXContainerConstants.def" }; @@ -151,7 +151,7 @@ struct Part { std::string Name; uint32_t Size; std::optional Program; - std::optional Flags; + std::optional Flags; std::optional Hash; std::optional Info; std::optional Signature; @@ -195,8 +195,8 @@ template <> struct MappingTraits { static void mapping(IO &IO, DXContainerYAML::DXILProgram &Program); }; -template <> struct MappingTraits { - static void mapping(IO &IO, DXContainerYAML::ShaderFlags &Flags); +template <> struct MappingTraits { + static void mapping(IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags); }; template <> struct MappingTraits { diff --git a/llvm/lib/Object/DXContainer.cpp b/llvm/lib/Object/DXContainer.cpp index 0401c20b98ec8e8..935749afe33852c 100644 --- a/llvm/lib/Object/DXContainer.cpp +++ b/llvm/lib/Object/DXContainer.cpp @@ -72,13 +72,13 @@ Error DXContainer::parseDXILHeader(StringRef Part) { return Error::success(); } -Error DXContainer::parseShaderFlags(StringRef Part) { - if (ShaderFlags) +Error DXContainer::parseShaderFeatureFlags(StringRef Part) { + if (ShaderFeatureFlags) return parseFailed("More than one SFI0 part is present in the file"); uint64_t FlagValue = 0; if (Error Err = readInteger(Part, Part.begin(), FlagValue)) return Err; - ShaderFlags = FlagValue; + ShaderFeatureFlags = FlagValue; return Error::success(); } @@ -168,7 +168,7 @@ Error DXContainer::parsePartOffsets() { return Err; break; case dxbc::PartType::SFI0: - if (Error Err = parseShaderFlags(PartData)) + if (Error Err = parseShaderFeatureFlags(PartData)) return Err; break; case dxbc::PartType::HASH: diff --git a/llvm/lib/ObjectYAML/DXContainerYAML.cpp b/llvm/lib/ObjectYAML/DXContainerYAML.cpp index 1f03f2c7d399667..7dc9822bdd221d6 100644 --- a/llvm/lib/ObjectYAML/DXContainerYAML.cpp +++ b/llvm/lib/ObjectYAML/DXContainerYAML.cpp @@ -23,15 +23,15 @@ namespace llvm { static_assert((uint64_t)dxbc::FeatureFlags::NextUnusedBit <= 1ull << 63, "Shader flag bits exceed enum size."); -DXContainerYAML::ShaderFlags::ShaderFlags(uint64_t FlagData) { -#define SHADER_FLAG(Num, Val, Str) \ +DXContainerYAML::ShaderFeatureFlags::ShaderFeatureFlags(uint64_t FlagData) { +#define SHADER_FEATURE_FLAG(Num, Val, Str) \ Val = (FlagData & (uint64_t)dxbc::FeatureFlags::Val) > 0; #include "llvm/BinaryFormat/DXContainerConstants.def" } -uint64_t DXContainerYAML::ShaderFlags::getEncodedFlags() { +uint64_t DXContainerYAML::ShaderFeatureFlags::getEncodedFlags() { uint64_t Flag = 0; -#define SHADER_FLAG(Num, Val, Str) \ +#define SHADER_FEATURE_FLAG(Num, Val, Str) \ if (Val) \ Flag |= (uint64_t)dxbc::FeatureFlags::Val; #include "llvm/BinaryFormat/DXContainerConstants.def" @@ -103,9 +103,9 @@ void 
MappingTraits::mapping( IO.mapOptional("DXIL", Program.DXIL); } -void MappingTraits::mapping( - IO &IO, DXContainerYAML::ShaderFlags &Flags) { -#define SHADER_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val); +void MappingTraits::mapping( + IO &IO, DXContainerYAML::ShaderFeatureFlags &Flags) { +#define SHADER_FEATURE_FLAG(Num, Val, Str) IO.mapRequired(#Val, Flags.Val); #include "llvm/BinaryFormat/DXContainerConstants.def" }
diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp index bbb564356602119..66a9dc46bcbfbf6 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.cpp +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.cpp @@ -51,7 +51,7 @@ void ComputedShaderFlags::print(raw_ostream &OS) const { if (FlagVal == 0) return; OS << "; Note: shader requires additional functionality:\n"; -#define SHADER_FLAG(bit, FlagName, Str) \ +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \ if (FlagName) \ OS << "; " Str "\n"; #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/lib/Target/DirectX/DXILShaderFlags.h b/llvm/lib/Target/DirectX/DXILShaderFlags.h index 4f51873a2d0b34c..574a7b090f52814 100644 --- a/llvm/lib/Target/DirectX/DXILShaderFlags.h +++ b/llvm/lib/Target/DirectX/DXILShaderFlags.h @@ -29,17 +29,17 @@ class GlobalVariable; namespace dxil { struct ComputedShaderFlags { -#define SHADER_FLAG(bit, FlagName, Str) bool FlagName : 1; +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) bool FlagName : 1; #include "llvm/BinaryFormat/DXContainerConstants.def" -#define SHADER_FLAG(bit, FlagName, Str) FlagName = false; +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) FlagName = false; ComputedShaderFlags() { #include "llvm/BinaryFormat/DXContainerConstants.def" } operator uint64_t() const { uint64_t FlagValue = 0; -#define SHADER_FLAG(bit, FlagName, Str) \ +#define SHADER_FEATURE_FLAG(bit, FlagName, Str) \ FlagValue |= \ FlagName ? static_cast(dxbc::FeatureFlags::FlagName) : 0ull; #include "llvm/BinaryFormat/DXContainerConstants.def" diff --git a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp index b58d7cd952aff0e..69d9b9a2f784f79 100644 --- a/llvm/tools/obj2yaml/dxcontainer2yaml.cpp +++ b/llvm/tools/obj2yaml/dxcontainer2yaml.cpp @@ -71,10 +71,10 @@ dumpDXContainer(MemoryBufferRef Source) { break; } case dxbc::PartType::SFI0: { - std::optional Flags = Container.getShaderFlags(); + std::optional Flags = Container.getShaderFeatureFlags(); // Omit the flags in the YAML if they are missing or zero. if (Flags && *Flags > 0) - NewPart.Flags = DXContainerYAML::ShaderFlags(*Flags); + NewPart.Flags = DXContainerYAML::ShaderFeatureFlags(*Flags); break; } case dxbc::PartType::HASH: {
From cf1c97b2d29c51d6c2e79454f6ec3d1f8f98e672 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 27 Feb 2024 18:04:59 -0800 Subject: [PATCH 027/114] [AMDGPU] Do not attempt to fall back to default mutations (#83208)
IGLP itself will be in SavedMutations via mutations added during Scheduler creation, so falling back results in reapplying IGLP. In PostRA scheduling, if we have multiple regions with IGLP instructions, we may end up in an infinite loop. Disable the feature for now.
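As a hedged sketch of that re-entry problem (simplified, illustrative types; not the actual ScheduleDAGMutation classes):

  #include <vector>

  struct Mutation {
    // In post-RA scheduling, SavedMutations already contains the IGLP
    // mutation itself, so the removed fallback path modeled below re-enters
    // apply() on the same mutation and never terminates while a region
    // keeps failing to apply IGLP.
    std::vector<Mutation *> *SavedMutations = nullptr;
    void apply(bool Applied) {
      if (Applied || !SavedMutations)
        return;
      for (Mutation *M : *SavedMutations)
        M->apply(false); // list contains `this` -> unbounded recursion
    }
  };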
--- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 21 ++++--------------- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 5 ++--- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 8 +++---- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 10 ++++----- llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll | 15 +++++++++++++ 5 files changed, 28 insertions(+), 31 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index e3f724850795388..57769fe998d1fea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -2337,8 +2337,6 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { ScheduleDAGMI *DAG; - std::vector> *SavedMutations; - // Organize lists of SchedGroups by their SyncID. SchedGroups / // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added // between then. @@ -2381,10 +2379,7 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation { AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial; IGroupLPDAGMutation() = default; - IGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations) - : SavedMutations(SavedMutations), Phase(Phase) {} + IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {} }; unsigned SchedGroup::NumSchedGroups = 0; @@ -2602,13 +2597,6 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { PS.solve(); return; } - - if (!SavedMutations) - return; - - // We did not apply a mutation, fall back to SavedMutations - for (auto &m : *SavedMutations) - m->apply(DAG); } void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { @@ -2707,10 +2695,9 @@ namespace llvm { /// same scheduling region (e.g. pre and post-RA scheduling / multiple /// scheduling "phases"), we can reenter this mutation framework more than once /// for a given region. 
-std::unique_ptr createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations) { - return std::make_unique(Phase, SavedMutations); +std::unique_ptr +createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) { + return std::make_unique(Phase); } } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h index 46ef4d702d0022d..aff7096f26d671b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h @@ -20,9 +20,8 @@ namespace AMDGPU { enum class SchedulingPhase { Initial, PreRAReentry, PostRA }; } // namespace AMDGPU -std::unique_ptr createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase Phase, - std::vector> *SavedMutations); +std::unique_ptr +createIGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase); } // namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0d830df1f1f1dff..76e843455bab75c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -461,8 +461,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); DAG->addMutation(createAMDGPUExportClusteringDAGMutation()); return DAG; @@ -472,8 +471,7 @@ static ScheduleDAGInstrs * createGCNMaxILPMachineScheduler(MachineSchedContext *C) { ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(C, std::make_unique(C)); - DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial, nullptr)); + DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial)); return DAG; } @@ -937,7 +935,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation( - createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, nullptr)); + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) DAG->addMutation(createVOPDPairingMutation()); return DAG; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index f993ec8409c997e..9f419a7fbf6834f 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -713,8 +713,8 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { return false; SavedMutations.swap(DAG.Mutations); - DAG.addMutation(createIGroupLPDAGMutation( - AMDGPU::SchedulingPhase::PreRAReentry, nullptr)); + DAG.addMutation( + createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; // Aggressivly try to reduce register pressure in the unclustered high RP @@ -858,8 +858,7 @@ bool GCNSchedStage::initGCNRegion() { StageID == GCNSchedStageID::ILPInitialSchedule; DAG.addMutation(createIGroupLPDAGMutation( IsInitialStage ? 
AMDGPU::SchedulingPhase::Initial - : AMDGPU::SchedulingPhase::PreRAReentry, - &SavedMutations)); + : AMDGPU::SchedulingPhase::PreRAReentry)); } return true; @@ -1573,8 +1572,7 @@ void GCNPostScheduleDAGMILive::schedule() { if (HasIGLPInstrs) { SavedMutations.clear(); SavedMutations.swap(Mutations); - addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA, - &SavedMutations)); + addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA)); } ScheduleDAGMI::schedule(); diff --git a/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll new file mode 100644 index 000000000000000..1113acb3c0305a0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/iglp.opt.reentry.ll @@ -0,0 +1,15 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -O3 < %s | FileCheck %s + +; Test should not result in build failure +; CHECK-LABEL: shouldNotReApply + +define amdgpu_kernel void @shouldNotReApply() { +entry: + tail call void @llvm.amdgcn.sched.barrier(i32 0) + store <4 x i32> zeroinitializer, ptr addrspace(3) null, align 2147483648 + tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0) + tail call void @llvm.amdgcn.sched.barrier(i32 0) + store i32 0, ptr addrspace(5) null, align 2147483648 + tail call void @llvm.amdgcn.sched.group.barrier(i32 0, i32 0, i32 0) + ret void +} From 91d23370cd4608f84f5209e445579a3b24ae3545 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 10:26:54 +0800 Subject: [PATCH 028/114] [RISCV] Use a tail agnostic vslideup if possible for scalable insert_subvector (#83146) If we know that an insert_subvector inserting a fixed subvector will overwrite the entire tail of the vector, we use a tail agnostic vslideup. This was added in https://reviews.llvm.org/D147347, but we can do the same thing for scalable vectors too. The `Policy` variable is defined in a slightly weird place but this is to mirror the fixed length subvector code path as closely as possible. I think we may be able to deduplicate them in future. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 ++++++++- llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll | 14 +++++++------- llvm/test/CodeGen/RISCV/rvv/setcc-fp.ll | 8 ++++---- llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll | 6 +++--- llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll | 2 +- .../CodeGen/RISCV/rvv/vector-interleave-store.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll | 12 ++++++------ llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll | 4 ++-- .../CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll | 4 ++-- 10 files changed, 35 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e34750d057301c0..e95e21bda687e8e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9728,8 +9728,15 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, auto [Mask, VL] = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget); + ElementCount EndIndex = + ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount(); VL = computeVLMax(SubVecVT, DL, DAG); + // Use tail agnostic policy if we're inserting over Vec's tail. + unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if (EndIndex == VecVT.getVectorElementCount()) + Policy = RISCVII::TAIL_AGNOSTIC; + // If we're inserting into the lowest elements, use a tail undisturbed // vmv.v.v. 
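As a standalone sketch of the end-of-vector check described above (plain unsigned arithmetic standing in for ElementCount; the function name is illustrative):

  #include <cassert>

  enum Policy { TailUndisturbed, TailAgnostic };

  // The slideup writes elements [RemIdx, RemIdx + SubVecElts) of the
  // destination. The insert covers the destination's entire tail exactly
  // when it ends at VecElts; then no old tail element survives, so a
  // tail-agnostic policy is safe.
  Policy pickSlideupPolicy(unsigned RemIdx, unsigned SubVecElts,
                           unsigned VecElts) {
    assert(RemIdx + SubVecElts <= VecElts && "subvector must fit");
    return RemIdx + SubVecElts == VecElts ? TailAgnostic : TailUndisturbed;
  }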
if (RemIdx == 0) { @@ -9743,7 +9750,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL); SubVec = getVSlideup(DAG, Subtarget, DL, InterSubVT, AlignedExtract, SubVec, - SlideupAmt, Mask, VL); + SlideupAmt, Mask, VL, Policy); } // If required, insert this subvector back into the correct vector register. diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index 0f3f57a0dec59ac..d377082761736fa 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -76,7 +76,7 @@ define @insert_nxv1i8_nxv4i8_3( %vec, @llvm.vector.insert.nxv1i8.nxv4i8( %vec, %subvec, i64 3) diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll index 243dc19a25588da..1ef63ffa9ee0c7f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-fp-vp.ll @@ -2235,9 +2235,9 @@ define @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv64f16( %va, @fcmp_oeq_vv_nxv32f64( %va, @fcmp_oeq_vf_nx16f64( %va) { ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: srli a0, a0, 3 ; RV32-NEXT: add a1, a0, a0 -; RV32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; RV32-NEXT: vslideup.vx v0, v24, a0 ; RV32-NEXT: ret ; @@ -3400,7 +3400,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: srli a0, a0, 3 ; RV64-NEXT: add a1, a0, a0 -; RV64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; RV64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; RV64-NEXT: vslideup.vx v0, v24, a0 ; RV64-NEXT: ret ; @@ -3413,7 +3413,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN32-NEXT: csrr a0, vlenb ; ZVFHMIN32-NEXT: srli a0, a0, 3 ; ZVFHMIN32-NEXT: add a1, a0, a0 -; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; ZVFHMIN32-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; ZVFHMIN32-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN32-NEXT: ret ; @@ -3426,7 +3426,7 @@ define @fcmp_oeq_vf_nx16f64( %va) { ; ZVFHMIN64-NEXT: csrr a0, vlenb ; ZVFHMIN64-NEXT: srli a0, a0, 3 ; ZVFHMIN64-NEXT: add a1, a0, a0 -; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; ZVFHMIN64-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; ZVFHMIN64-NEXT: vslideup.vx v0, v24, a0 ; ZVFHMIN64-NEXT: ret %vc = fcmp oeq %va, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll index 007afe12b8e4385..a23b7c7b6ae9e43 100644 --- a/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-int-vp.ll @@ -2424,7 +2424,7 @@ define @icmp_eq_vv_nxv32i32( %va, @icmp_eq_vx_nxv32i32( %va, i32 %b, ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret @@ -2492,7 +2492,7 @@ define @icmp_eq_vx_swap_nxv32i32( %va, i32 ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t ; CHECK-NEXT: add a0, a2, a2 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v16, v25, a2 ; CHECK-NEXT: vmv1r.v v0, v16 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll index a2ac684604b9441..5f35a4e50a9523f 100644 --- 
a/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll +++ b/llvm/test/CodeGen/RISCV/rvv/setcc-integer.ll @@ -3235,7 +3235,7 @@ define @icmp_eq_vi_nx16i64( %va) { ; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, ma ; CHECK-NEXT: vmseq.vi v24, v16, 0 ; CHECK-NEXT: vmseq.vi v0, v8, 0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma ; CHECK-NEXT: vslideup.vx v0, v24, a0 ; CHECK-NEXT: ret %vc = icmp eq %va, zeroinitializer diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll index c23c10205e6e36e..4c64b1677b36265 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-store.ll @@ -22,7 +22,7 @@ define void @vector_interleave_store_nxv32i1_nxv16i1( %a, @vector_interleave_nxv32i1_nxv16i1( ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; CHECK-NEXT: vslideup.vx v0, v8, a0 ; CHECK-NEXT: ret ; @@ -44,7 +44,7 @@ define @vector_interleave_nxv32i1_nxv16i1( ; ZVBB-NEXT: csrr a0, vlenb ; ZVBB-NEXT: srli a0, a0, 2 ; ZVBB-NEXT: add a1, a0, a0 -; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVBB-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVBB-NEXT: vslideup.vx v0, v8, a0 ; ZVBB-NEXT: ret %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) @@ -376,9 +376,9 @@ define @vector_interleave_nxv4f16_nxv2f16( @vector_interleave_nxv4f16_nxv2f16( @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) ret %res diff --git a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll index 8e983f63428a6a2..b888fde7d068367 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vfptoi-sdnode.ll @@ -937,7 +937,7 @@ define @vfptosi_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: vfncvt.rtz.x.f.w v8, v24 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptosi %va to @@ -967,7 +967,7 @@ define @vfptoui_nxv32f16_nxv32i1( %va) { ; ZVFHMIN-NEXT: vfncvt.rtz.xu.f.w v8, v24 ; ZVFHMIN-NEXT: vand.vi v8, v8, 1 ; ZVFHMIN-NEXT: vmsne.vi v0, v8, 0 -; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; ZVFHMIN-NEXT: vsetvli zero, a1, e8, mf2, ta, ma ; ZVFHMIN-NEXT: vslideup.vx v0, v16, a0 ; ZVFHMIN-NEXT: ret %evec = fptoui %va to diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 2546ec95a007939..515d77109af9f71 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -894,7 +894,7 @@ define half @vreduce_ord_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: lui a2, 1048568 ; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v9, a1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0 @@ -982,7 +982,7 @@ define half @vreduce_fadd_nxv3f16( %v, half %s) { ; CHECK-NEXT: lui a2, 1048568 ; CHECK-NEXT: vsetvli a3, zero, e16, m1, ta, ma ; CHECK-NEXT: vmv.v.x v9, a2 -; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma ; 
CHECK-NEXT: vslideup.vx v8, v9, a1 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfmv.s.f v9, fa0
From 1f2a1a72ae6615ce80fcd6f1185d1cde607377d2 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Tue, 27 Feb 2024 18:33:43 -0800 Subject: [PATCH 029/114] [flang][runtime] Fixed flang+Werror buildbots after #83169.
--- flang/include/flang/Common/float128.h | 5 +++++ flang/runtime/Float128Math/cabs.cpp | 6 ++++-- flang/runtime/Float128Math/math-entries.h | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/flang/include/flang/Common/float128.h b/flang/include/flang/Common/float128.h index 61b3a77b867a93b..2e76bc0a162e61d 100644 --- a/flang/include/flang/Common/float128.h +++ b/flang/include/flang/Common/float128.h @@ -54,9 +54,13 @@ /* Define pure C CFloat128Type and CFloat128ComplexType. */ #if LDBL_MANT_DIG == 113 typedef long double CFloat128Type; +#ifndef __cplusplus typedef long double _Complex CFloat128ComplexType; +#endif #elif HAS_FLOAT128 typedef __float128 CFloat128Type; + +#ifndef __cplusplus /* * Use mode() attribute supported by GCC and Clang. * Adjust it for other compilers as needed. @@ -66,5 +70,6 @@ typedef _Complex float __attribute__((mode(TC))) CFloat128ComplexType; #else typedef _Complex float __attribute__((mode(KC))) CFloat128ComplexType; #endif +#endif // __cplusplus #endif #endif /* FORTRAN_COMMON_FLOAT128_H_ */ diff --git a/flang/runtime/Float128Math/cabs.cpp b/flang/runtime/Float128Math/cabs.cpp index 2867c8a4578a80b..827b197a6a81ae8 100644 --- a/flang/runtime/Float128Math/cabs.cpp +++ b/flang/runtime/Float128Math/cabs.cpp @@ -10,13 +10,15 @@ namespace Fortran::runtime { extern "C" { - +#if 0 +// FIXME: temporarily disabled. Need to add pure C entry point +// using C _Complex ABI. #if LDBL_MANT_DIG == 113 || HAS_FLOAT128 // NOTE: Flang calls the runtime APIs using C _Complex ABI CppTypeFor RTDEF(CAbsF128)(CFloat128ComplexType x) { return CAbs::invoke(x); } #endif - +#endif } // extern "C" } // namespace Fortran::runtime diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index 141648d2fb2c54f..83298674c4971fd 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -61,7 +61,6 @@ DEFINE_FALLBACK(Asinh) DEFINE_FALLBACK(Atan) DEFINE_FALLBACK(Atan2) DEFINE_FALLBACK(Atanh) -DEFINE_FALLBACK(CAbs) DEFINE_FALLBACK(Ceil) DEFINE_FALLBACK(Cos) DEFINE_FALLBACK(Cosh) @@ -163,7 +162,6 @@ DEFINE_SIMPLE_ALIAS(Asinh, asinhq) DEFINE_SIMPLE_ALIAS(Atan, atanq) DEFINE_SIMPLE_ALIAS(Atan2, atan2q) DEFINE_SIMPLE_ALIAS(Atanh, atanhq) -DEFINE_SIMPLE_ALIAS(CAbs, cabsq) DEFINE_SIMPLE_ALIAS(Ceil, ceilq) DEFINE_SIMPLE_ALIAS(Cos, cosq) DEFINE_SIMPLE_ALIAS(Cosh, coshq)
From 7c206c7812408f152baffa3c73f765b7d9ffdf18 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Tue, 27 Feb 2024 18:44:28 -0800 Subject: [PATCH 030/114] [BOLT] Refactor interface for instruction labels. NFCI (#83209)
To avoid accidentally setting the label twice for the same instruction, which can lead to a "lost" label, introduce the getOrCreateInstLabel() function. Rename the existing functions to getInstLabel()/setInstLabel() to make it explicit that they operate on instruction labels. Add an assertion in setInstLabel() that the instruction did not have a prior label set.
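In caller terms, a usage sketch only (`BC` and `Inst` stand for a BinaryContext and an annotated MCInst as in the BOLT sources; the API names come from the diff below):

  // Before: a two-step pattern, where a later setInstLabel() on the same
  // MCInst silently overwrote -- and thus lost -- an existing label.
  MCSymbol *Label = BC.MIB->getInstLabel(Inst);
  if (!Label) {
    Label = BC.Ctx->createTempSymbol("__SC_");
    BC.MIB->setInstLabel(Inst, Label);
  }

  // After: one call that returns the existing label or creates it, while
  // setInstLabel() now asserts that no label was assigned before.
  MCSymbol *Label2 = BC.MIB->getOrCreateInstLabel(Inst, "__SC_", BC.Ctx.get());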
--- bolt/include/bolt/Core/MCPlusBuilder.h | 9 +++++++-- bolt/lib/Core/BinaryContext.cpp | 2 +- bolt/lib/Core/BinaryEmitter.cpp | 2 +- bolt/lib/Core/BinaryFunction.cpp | 2 +- bolt/lib/Core/Exceptions.cpp | 5 ++--- bolt/lib/Core/MCPlusBuilder.cpp | 19 ++++++++++++++++--- bolt/lib/Rewrite/LinuxKernelRewriter.cpp | 14 ++++---------- 7 files changed, 32 insertions(+), 21 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index eeb7609ff6b5f15..6bb76d1b917db31 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1183,11 +1183,16 @@ class MCPlusBuilder { bool clearOffset(MCInst &Inst) const; /// Return the label of \p Inst, if available. - MCSymbol *getLabel(const MCInst &Inst) const; + MCSymbol *getInstLabel(const MCInst &Inst) const; + + /// Set the label of \p Inst or return the existing label for the instruction. + /// This label will be emitted right before \p Inst is emitted to MCStreamer. + MCSymbol *getOrCreateInstLabel(MCInst &Inst, const Twine &Name, + MCContext *Ctx) const; /// Set the label of \p Inst. This label will be emitted right before \p Inst /// is emitted to MCStreamer. - bool setLabel(MCInst &Inst, MCSymbol *Label) const; + void setInstLabel(MCInst &Inst, MCSymbol *Label) const; /// Get instruction size specified via annotation. std::optional getSize(const MCInst &Inst) const; diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index d544ece13a832fd..b29ebbbfa18c4bc 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -1967,7 +1967,7 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, OS << " # Offset: " << *Offset; if (std::optional Size = MIB->getSize(Instruction)) OS << " # Size: " << *Size; - if (MCSymbol *Label = MIB->getLabel(Instruction)) + if (MCSymbol *Label = MIB->getInstLabel(Instruction)) OS << " # Label: " << *Label; MIB->printAnnotations(Instruction, OS); diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index d4b668c1d7e7bdd..97d19b75200f51d 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -489,7 +489,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, if (!EmitCodeOnly) { // A symbol to be emitted before the instruction to mark its location. - MCSymbol *InstrLabel = BC.MIB->getLabel(Instr); + MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr); if (opts::UpdateDebugSections && BF.getDWARFUnit()) { LastLocSeen = emitLineInfo(BF, Instr.getLoc(), LastLocSeen, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 54f2f9d972a4617..00df42c11e2239c 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1424,7 +1424,7 @@ Error BinaryFunction::disassemble() { InstrMapType::iterator II = Instructions.find(Offset); assert(II != Instructions.end() && "reference to non-existing instruction"); - BC.MIB->setLabel(II->second, Label); + BC.MIB->setInstLabel(II->second, Label); } // Reset symbolizer for the disassembler. diff --git a/bolt/lib/Core/Exceptions.cpp b/bolt/lib/Core/Exceptions.cpp index 54618aeb95cccbe..82bddf76d5b8c9a 100644 --- a/bolt/lib/Core/Exceptions.cpp +++ b/bolt/lib/Core/Exceptions.cpp @@ -408,12 +408,11 @@ void BinaryFunction::updateEHRanges() { // Same symbol is used for the beginning and the end of the range. 
MCSymbol *EHSymbol; - if (MCSymbol *InstrLabel = BC.MIB->getLabel(Instr)) { + if (MCSymbol *InstrLabel = BC.MIB->getInstLabel(Instr)) { EHSymbol = InstrLabel; } else { std::unique_lock Lock(BC.CtxMutex); - EHSymbol = BC.Ctx->createNamedTempSymbol("EH"); - BC.MIB->setLabel(Instr, EHSymbol); + EHSymbol = BC.MIB->getOrCreateInstLabel(Instr, "EH", BC.Ctx.get()); } // At this point we could be in one of the following states: diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index 44e5f88d8950fc3..bd9bd0c45922a5d 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -12,6 +12,7 @@ #include "bolt/Core/MCPlusBuilder.h" #include "bolt/Core/MCPlus.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrDesc.h" @@ -266,17 +267,29 @@ bool MCPlusBuilder::clearOffset(MCInst &Inst) const { return true; } -MCSymbol *MCPlusBuilder::getLabel(const MCInst &Inst) const { +MCSymbol *MCPlusBuilder::getInstLabel(const MCInst &Inst) const { if (std::optional Label = getAnnotationOpValue(Inst, MCAnnotation::kLabel)) return reinterpret_cast(*Label); return nullptr; } -bool MCPlusBuilder::setLabel(MCInst &Inst, MCSymbol *Label) const { +MCSymbol *MCPlusBuilder::getOrCreateInstLabel(MCInst &Inst, const Twine &Name, + MCContext *Ctx) const { + MCSymbol *Label = getInstLabel(Inst); + if (Label) + return Label; + + Label = Ctx->createNamedTempSymbol(Name); + setAnnotationOpValue(Inst, MCAnnotation::kLabel, + reinterpret_cast(Label)); + return Label; +} + +void MCPlusBuilder::setInstLabel(MCInst &Inst, MCSymbol *Label) const { + assert(!getInstLabel(Inst) && "Instruction already has assigned label."); setAnnotationOpValue(Inst, MCAnnotation::kLabel, reinterpret_cast(Label)); - return true; } std::optional MCPlusBuilder::getSize(const MCInst &Inst) const { diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp index 6377c1197253c89..0d7dc1070ce75e4 100644 --- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp +++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp @@ -770,11 +770,8 @@ Error LinuxKernelRewriter::rewriteORCTables() { continue; // Issue label for the instruction. - MCSymbol *Label = BC.MIB->getLabel(Inst); - if (!Label) { - Label = BC.Ctx->createTempSymbol("__ORC_"); - BC.MIB->setLabel(Inst, Label); - } + MCSymbol *Label = + BC.MIB->getOrCreateInstLabel(Inst, "__ORC_", BC.Ctx.get()); if (Error E = emitORCEntry(0, *ErrorOrState, Label)) return E; @@ -908,11 +905,8 @@ Error LinuxKernelRewriter::readStaticCalls() { BC.MIB->addAnnotation(*Inst, "StaticCall", EntryID); - MCSymbol *Label = BC.MIB->getLabel(*Inst); - if (!Label) { - Label = BC.Ctx->createTempSymbol("__SC_"); - BC.MIB->setLabel(*Inst, Label); - } + MCSymbol *Label = + BC.MIB->getOrCreateInstLabel(*Inst, "__SC_", BC.Ctx.get()); StaticCallEntries.push_back({EntryID, BF, Label}); } From 9d56be010cf30054313f5b3fea82331491c58cdb Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 28 Feb 2024 05:00:07 +0100 Subject: [PATCH 031/114] [MLIR][OpenMP] Support basic materialization for `omp.private` ops (#81715) Adds basic support for materializing delayed privatization. So far, the restrictions on the implementation are: - Only `private` clauses are supported (`firstprivate` support will be added in a later PR). 
--- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 5 +- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 182 +++++++++++++++--- mlir/test/Target/LLVMIR/openmp-private.mlir | 142 ++++++++++++++ 3 files changed, 297 insertions(+), 32 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/openmp-private.mlir diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index c2b471ab96183f8..8a6980e2c6a2d96 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1957,7 +1957,10 @@ LogicalResult PrivateClauseOp::verify() { Type symType = getType(); auto verifyTerminator = [&](Operation *terminator) -> LogicalResult { - if (!terminator->hasSuccessors() && !llvm::isa(terminator)) + if (!terminator->getBlock()->getSuccessors().empty()) + return success(); + + if (!llvm::isa(terminator)) return mlir::emitError(terminator->getLoc()) << "expected exit block terminator to be an `omp.yield` op."; diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 6e53d801a0d2f0b..8c20689c4a39dd3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -396,9 +396,9 @@ collectReductionDecls(T loop, /// Translates the blocks contained in the given region and appends them to at /// the current insertion point of `builder`. The operations of the entry block -/// are appended to the current insertion block, which is not expected to have a -/// terminator. If set, `continuationBlockArgs` is populated with translated -/// values that correspond to the values omp.yield'ed from the region. +/// are appended to the current insertion block. If set, `continuationBlockArgs` +/// is populated with translated values that correspond to the values +/// omp.yield'ed from the region. static LogicalResult inlineConvertOmpRegions( Region ®ion, StringRef blockName, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, @@ -409,7 +409,14 @@ static LogicalResult inlineConvertOmpRegions( // Special case for single-block regions that don't create additional blocks: // insert operations without creating additional blocks. if (llvm::hasSingleElement(region)) { + llvm::Instruction *potentialTerminator = + builder.GetInsertBlock()->empty() ? nullptr + : &builder.GetInsertBlock()->back(); + + if (potentialTerminator && potentialTerminator->isTerminator()) + potentialTerminator->removeFromParent(); moduleTranslation.mapBlock(®ion.front(), builder.GetInsertBlock()); + if (failed(moduleTranslation.convertBlock( region.front(), /*ignoreArguments=*/true, builder))) return failure(); @@ -423,6 +430,10 @@ static LogicalResult inlineConvertOmpRegions( // Drop the mapping that is no longer necessary so that the same region can // be processed multiple times. moduleTranslation.forgetMapping(region); + + if (potentialTerminator && potentialTerminator->isTerminator()) + potentialTerminator->insertAfter(&builder.GetInsertBlock()->back()); + return success(); } @@ -1000,11 +1011,50 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } +/// A RAII class that on construction replaces the region arguments of the +/// parallel op (which correspond to private variables) with the actual private +/// variables they correspond to. This prepares the parallel op so that it +/// matches what is expected by the OMPIRBuilder. 
+///
+/// On destruction, it restores the original state of the operation so that on
+/// the MLIR side, the op is not affected by conversion to LLVM IR.
+class OmpParallelOpConversionManager {
+public:
+  OmpParallelOpConversionManager(omp::ParallelOp opInst)
+      : region(opInst.getRegion()), privateVars(opInst.getPrivateVars()),
+        privateArgBeginIdx(opInst.getNumReductionVars()),
+        privateArgEndIdx(privateArgBeginIdx + privateVars.size()) {
+    auto privateVarsIt = privateVars.begin();
+
+    for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+         ++argIdx, ++privateVarsIt)
+      mlir::replaceAllUsesInRegionWith(region.getArgument(argIdx),
+                                       *privateVarsIt, region);
+  }
+
+  ~OmpParallelOpConversionManager() {
+    auto privateVarsIt = privateVars.begin();
+
+    for (size_t argIdx = privateArgBeginIdx; argIdx < privateArgEndIdx;
+         ++argIdx, ++privateVarsIt)
+      mlir::replaceAllUsesInRegionWith(*privateVarsIt,
+                                       region.getArgument(argIdx), region);
+  }
+
+private:
+  Region &region;
+  OperandRange privateVars;
+  unsigned privateArgBeginIdx;
+  unsigned privateArgEndIdx;
+};
+
 /// Converts the OpenMP parallel operation to LLVM IR.
 static LogicalResult
 convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  OmpParallelOpConversionManager raii(opInst);
+
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
   LogicalResult bodyGenStatus = success();
 
@@ -1086,12 +1136,81 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
   // TODO: Perform appropriate actions according to the data-sharing
   // attribute (shared, private, firstprivate, ...) of variables.
-  // Currently defaults to shared.
+  // Currently shared and private are supported.
   auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP,
                     llvm::Value &, llvm::Value &vPtr,
                     llvm::Value *&replacementValue) -> InsertPointTy {
     replacementValue = &vPtr;
 
+    // If this is a private value, this lambda will return the corresponding
+    // mlir value and its `PrivateClauseOp`. Otherwise, empty values are
+    // returned.
+    auto [privVar, privatizerClone] =
+        [&]() -> std::pair<mlir::Value, omp::PrivateClauseOp> {
+      if (!opInst.getPrivateVars().empty()) {
+        auto privVars = opInst.getPrivateVars();
+        auto privatizers = opInst.getPrivatizers();
+
+        for (auto [privVar, privatizerAttr] :
+             llvm::zip_equal(privVars, *privatizers)) {
+          // Find the MLIR private variable corresponding to the LLVM value
+          // being privatized.
+          llvm::Value *llvmPrivVar = moduleTranslation.lookupValue(privVar);
+          if (llvmPrivVar != &vPtr)
+            continue;
+
+          SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerAttr);
+          omp::PrivateClauseOp privatizer =
+              SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(
+                  opInst, privSym);
+
+          // Clone the privatizer in case it is used by more than one parallel
+          // region. The privatizer is processed in-place (see below) before it
+          // gets inlined in the parallel region and therefore processing the
+          // original op is dangerous.
+          return {privVar, privatizer.clone()};
+        }
+      }
+
+      return {mlir::Value(), omp::PrivateClauseOp()};
+    }();
+
+    if (privVar) {
+      if (privatizerClone.getDataSharingType() ==
+          omp::DataSharingClauseType::FirstPrivate) {
+        privatizerClone.emitOpError(
+            "TODO: delayed privatization is not "
+            "supported for `firstprivate` clauses yet.");
+        bodyGenStatus = failure();
+        return codeGenIP;
+      }
+
+      Region &allocRegion = privatizerClone.getAllocRegion();
+
+      // Replace the privatizer block argument with the MLIR value being
+      // privatized. This way, the body of the privatizer will be changed from
+      // using the region/block argument to the value being privatized.
+      auto allocRegionArg = allocRegion.getArgument(0);
+      replaceAllUsesInRegionWith(allocRegionArg, privVar, allocRegion);
+
+      auto oldIP = builder.saveIP();
+      builder.restoreIP(allocaIP);
+
+      SmallVector<llvm::Value *, 1> yieldedValues;
+      if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder,
+                                         moduleTranslation, &yieldedValues))) {
+        opInst.emitError("failed to inline `alloc` region of an `omp.private` "
+                         "op in the parallel region");
+        bodyGenStatus = failure();
+      } else {
+        assert(yieldedValues.size() == 1);
+        replacementValue = yieldedValues.front();
+      }
+
+      privatizerClone.erase();
+      builder.restoreIP(oldIP);
+    }
+
     return codeGenIP;
   };
 
@@ -1635,7 +1754,7 @@ getRefPtrIfDeclareTarget(mlir::Value value,
 
 // A small helper structure to contain data gathered
 // for map lowering and coalese it into one area and
 // avoiding extra computations such as searches in the
-// llvm module for lowered mapped varibles or checking
+// llvm module for lowered mapped variables or checking
 // if something is declare target (and retrieving the
 // value) more than neccessary.
 struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy {
@@ -2854,26 +2973,26 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
                     moduleTranslation);
                 return failure();
               })
-          .Case(
-              "omp.requires",
-              [&](Attribute attr) {
-                if (auto requiresAttr = attr.dyn_cast<omp::ClauseRequiresAttr>()) {
-                  using Requires = omp::ClauseRequires;
-                  Requires flags = requiresAttr.getValue();
-                  llvm::OpenMPIRBuilderConfig &config =
-                      moduleTranslation.getOpenMPBuilder()->Config;
-                  config.setHasRequiresReverseOffload(
-                      bitEnumContainsAll(flags, Requires::reverse_offload));
-                  config.setHasRequiresUnifiedAddress(
-                      bitEnumContainsAll(flags, Requires::unified_address));
-                  config.setHasRequiresUnifiedSharedMemory(
-                      bitEnumContainsAll(flags, Requires::unified_shared_memory));
-                  config.setHasRequiresDynamicAllocators(
-                      bitEnumContainsAll(flags, Requires::dynamic_allocators));
-                  return success();
-                }
-                return failure();
-              })
+          .Case("omp.requires",
+                [&](Attribute attr) {
+                  if (auto requiresAttr =
+                          attr.dyn_cast<omp::ClauseRequiresAttr>()) {
+                    using Requires = omp::ClauseRequires;
+                    Requires flags = requiresAttr.getValue();
+                    llvm::OpenMPIRBuilderConfig &config =
+                        moduleTranslation.getOpenMPBuilder()->Config;
+                    config.setHasRequiresReverseOffload(
+                        bitEnumContainsAll(flags, Requires::reverse_offload));
+                    config.setHasRequiresUnifiedAddress(
+                        bitEnumContainsAll(flags, Requires::unified_address));
+                    config.setHasRequiresUnifiedSharedMemory(
+                        bitEnumContainsAll(flags, Requires::unified_shared_memory));
+                    config.setHasRequiresDynamicAllocators(
+                        bitEnumContainsAll(flags, Requires::dynamic_allocators));
+                    return success();
+                  }
+                  return failure();
+                })
           .Default([](Attribute) {
             // Fall through for omp attributes that do not require lowering.
return success(); @@ -2988,12 +3107,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( .Case([&](omp::TargetOp) { return convertOmpTarget(*op, builder, moduleTranslation); }) - .Case([&](auto op) { - // No-op, should be handled by relevant owning operations e.g. - // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then - // discarded - return success(); - }) + .Case( + [&](auto op) { + // No-op, should be handled by relevant owning operations e.g. + // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then + // discarded + return success(); + }) .Default([&](Operation *inst) { return inst->emitError("unsupported OpenMP operation: ") << inst->getName(); diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir new file mode 100644 index 000000000000000..58bda87c3b7bea3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -0,0 +1,142 @@ +// Test code-gen for `omp.parallel` ops with delayed privatizers (i.e. using +// `omp.private` ops). + +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +llvm.func @parallel_op_1_private(%arg0: !llvm.ptr) { + omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: @parallel_op_1_private +// CHECK-SAME: (ptr %[[ORIG:.*]]) { +// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[ORIG_GEP:.*]] = getelementptr { ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0 +// CHECK: store ptr %[[ORIG]], ptr %[[ORIG_GEP]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_1_private..omp_par, ptr %[[OMP_PAR_ARG]]) +// CHECK: } + +// CHECK-LABEL: void @parallel_op_1_private..omp_par +// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]]) +// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %[[ARG]], i32 0, i32 0 +// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 + +// Check that the privatizer alloc region was inlined properly. +// CHECK: %[[PRIV_ALLOC:.*]] = alloca float, align 4 +// CHECK: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4 +// CHECK: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC]], align 4 +// CHECK-NEXT: br + +// Check that the privatized value is used (rather than the original one). +// CHECK: load float, ptr %[[PRIV_ALLOC]], align 4 +// CHECK: } + +llvm.func @parallel_op_2_privates(%arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.parallel private(@x.privatizer %arg0 -> %arg2 : !llvm.ptr, @y.privatizer %arg1 -> %arg3 : !llvm.ptr) { + %0 = llvm.load %arg2 : !llvm.ptr -> f32 + %1 = llvm.load %arg3 : !llvm.ptr -> i32 + omp.terminator + } + llvm.return +} + +// CHECK-LABEL: @parallel_op_2_privates +// CHECK-SAME: (ptr %[[ORIG1:.*]], ptr %[[ORIG2:.*]]) { +// CHECK: %[[OMP_PAR_ARG:.*]] = alloca { ptr, ptr }, align 8 +// CHECK: %[[ORIG1_GEP:.*]] = getelementptr { ptr, ptr }, ptr %[[OMP_PAR_ARG]], i32 0, i32 0 +// CHECK: store ptr %[[ORIG1]], ptr %[[ORIG1_GEP]], align 8 +// CHECK: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @1, i32 1, ptr @parallel_op_2_privates..omp_par, ptr %[[OMP_PAR_ARG]])
+// CHECK: }
+
+// CHECK-LABEL: void @parallel_op_2_privates..omp_par
+// CHECK-SAME: (ptr noalias %{{.*}}, ptr noalias %{{.*}}, ptr %[[ARG:.*]])
+// CHECK: %[[ORIG1_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 0
+// CHECK: %[[ORIG1_PTR:.*]] = load ptr, ptr %[[ORIG1_PTR_PTR]], align 8
+// CHECK: %[[ORIG2_PTR_PTR:.*]] = getelementptr { ptr, ptr }, ptr %[[ARG]], i32 0, i32 1
+// CHECK: %[[ORIG2_PTR:.*]] = load ptr, ptr %[[ORIG2_PTR_PTR]], align 8
+
+// Check that the privatizer alloc region was inlined properly.
+// CHECK: %[[PRIV1_ALLOC:.*]] = alloca float, align 4
+// CHECK: %[[ORIG1_VAL:.*]] = load float, ptr %[[ORIG1_PTR]], align 4
+// CHECK: store float %[[ORIG1_VAL]], ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: %[[PRIV2_ALLOC:.*]] = alloca i32, align 4
+// CHECK: %[[ORIG2_VAL:.*]] = load i32, ptr %[[ORIG2_PTR]], align 4
+// CHECK: store i32 %[[ORIG2_VAL]], ptr %[[PRIV2_ALLOC]], align 4
+// CHECK-NEXT: br
+
+// Check that the privatized value is used (rather than the original one).
+// CHECK: load float, ptr %[[PRIV1_ALLOC]], align 4
+// CHECK: load i32, ptr %[[PRIV2_ALLOC]], align 4
+// CHECK: }
+
+omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> f32
+  llvm.store %1, %0 : f32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}
+
+omp.private {type = private} @y.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x i32 : (i32) -> !llvm.ptr
+  %1 = llvm.load %arg0 : !llvm.ptr -> i32
+  llvm.store %1, %0 : i32, !llvm.ptr
+  omp.yield(%0 : !llvm.ptr)
+}
+
+// -----
+
+llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) {
+  omp.parallel private(@multi_block.privatizer %arg0 -> %arg2 : !llvm.ptr) {
+    %0 = llvm.load %arg2 : !llvm.ptr -> f32
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: define internal void @parallel_op_private_multi_block..omp_par
+// CHECK: omp.par.entry:
+// CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
+// CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8
+// CHECK: br label %[[PRIV_BB1:.*]]
+
+// Check contents of the first block in the `alloc` region.
+// CHECK: [[PRIV_BB1]]:
+// CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, align 4
+// CHECK-NEXT: br label %[[PRIV_BB2:.*]]
+
+// Check contents of the second block in the `alloc` region.
+// CHECK: [[PRIV_BB2]]:
+// CHECK-NEXT: %[[ORIG_PTR2:.*]] = phi ptr [ %[[ORIG_PTR]], %[[PRIV_BB1]] ]
+// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB1]] ]
+// CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR2]], align 4
+// CHECK-NEXT: store float %[[ORIG_VAL]], ptr %[[PRIV_ALLOC2]], align 4
+// CHECK-NEXT: br label %[[PRIV_CONT:.*]]
+
+// Check that the privatizer's continuation block yields the private clone's
+// address.
+// CHECK: [[PRIV_CONT]]:
+// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB2]] ]
+// CHECK-NEXT: br label %[[PAR_REG:.*]]
+
+// Check that the body of the parallel region loads from the private clone.
+// CHECK: [[PAR_REG]]:
+// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4
+
+omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc {
+^bb0(%arg0: !llvm.ptr):
+  %c1 = llvm.mlir.constant(1 : i32) : i32
+  %0 = llvm.alloca %c1 x f32 : (i32) -> !llvm.ptr
+  llvm.br ^bb1(%arg0, %0 : !llvm.ptr, !llvm.ptr)
+
+^bb1(%arg1: !llvm.ptr, %arg2: !llvm.ptr):
+  %1 = llvm.load %arg1 : !llvm.ptr -> f32
+  llvm.store %1, %arg2 : f32, !llvm.ptr
+  omp.yield(%arg2 : !llvm.ptr)
+}

From 87c0260f45e5a02cb07722d089dae3f0f84c7b3d Mon Sep 17 00:00:00 2001
From: erman-gurses <99776114+erman-gurses@users.noreply.github.com>
Date: Tue, 27 Feb 2024 23:28:12 -0500
Subject: [PATCH 032/114] [AMDGPU] Add parameterization for optimized shared
 memory variables (#82508)

- This PR adds parameterization for the shared memory variables that are used
  for optimization: `sharedMemoryLineSizeBytes` and `defaultVectorSizeBits`.
- The default values are set to 128 for both variables, since these values
  give zero bank conflicts.

---
 .../AMDGPU/TransformOps/AMDGPUTransformOps.td |  6 +-
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  |  9 ++-
 .../Dialect/AMDGPU/Transforms/Transforms.h    | 10 +++-
 .../TransformOps/AMDGPUTransformOps.cpp       |  3 +-
 .../Transforms/OptimizeSharedMemory.cpp       | 57 ++++++++++---------
 .../AMDGPU/optimize_shmem_reads_writes.mlir   | 50 +++++++---------
 ...transform_optimize_shmem_reads_writes.mlir | 46 +++++++--------
 7 files changed, 97 insertions(+), 84 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
index 23873d86b495c61..0eb67050608630f 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
@@ -13,8 +13,8 @@
 include "mlir/Dialect/Transform/IR/TransformAttrs.td"
 include "mlir/Dialect/Transform/IR/TransformDialect.td"
 include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
 include "mlir/Dialect/Transform/IR/TransformTypes.td"
-include "mlir/Interfaces/SideEffectInterfaces.td"
 
+include "mlir/Interfaces/SideEffectInterfaces.td"
 //===----------------------------------------------------------------------===//
 // ApplyOptimizeSharedMemoryReadsAndWritesOp
 //===----------------------------------------------------------------------===//
@@ -28,7 +28,9 @@ def ApplyOptimizeSharedMemoryReadsAndWritesOp :
     reads/writes with the goal of avoiding bank conflicts.
   }];
 
-  let arguments = (ins TransformHandleTypeInterface:$target);
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                   DefaultValuedOptionalAttr<I64Attr, "128">:$sharedMemoryLineSizeBytes,
+                   DefaultValuedOptionalAttr<I64Attr, "128">:$defaultVectorSizeBits);
   let results = (outs);
 
   let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index c8059e6d316e8a3..67f951fd19d1727 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -37,10 +37,17 @@ def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
     attempts to optimize reads/writes from a memref representing GPU shared
     memory in order to avoid bank conflicts.
  }];
-
   let dependentDialects = [ "memref::MemRefDialect", "vector::VectorDialect" ];
+  let options = [
+    Option<"sharedMemoryLineSizeBytes", "shared-memory-line-size-bytes", "int64_t",
+           /*default=*/"128",
+           "Shared memory line size in bytes">,
+    Option<"defaultVectorSizeBits", "default-vector-size-bits", "int64_t",
+           /*default=*/"128",
+           "Default vector size in bits">,
+  ];
 }
 
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_

diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
index 79f9ab71a2b430a..843cea2c503b9a2 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
@@ -45,11 +45,15 @@ namespace amdgpu {
 /// function that depends on the row Index. The permutation function is chosen
 /// to ensure that sequential distributed+vectorized reads/writes down a single
 /// dimension of the memref have minimal conflicts.
-LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
-                                                 Value memrefValue);
+LogicalResult
+optimizeSharedMemoryReadsAndWrites(Operation *parentOp, Value memrefValue,
+                                   int64_t sharedMemoryLineSizeBytes,
+                                   int64_t defaultVectorSizeBits);
 
 std::optional<LogicalResult>
-optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp);
+optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+                                     int64_t sharedMemoryLineSizeBytes,
+                                     int64_t defaultVectorSizeBits);
 
 } // namespace amdgpu
 } // namespace mlir

diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
index ff29f9f69385359..b7e17a928973899 100644
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
@@ -27,7 +27,8 @@ DiagnosedSilenceableFailure
 ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
     TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results,
     TransformState &state) {
-  optimizeSharedMemoryReadsAndWritesOp(funcOp);
+  optimizeSharedMemoryReadsAndWritesOp(funcOp, getSharedMemoryLineSizeBytes(),
+                                       getDefaultVectorSizeBits());
   return DiagnosedSilenceableFailure::success();
 }

diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
index 6bd03ed833898dd..32fab265e03cc0c 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
@@ -35,13 +35,6 @@ namespace amdgpu {
 using namespace mlir;
 using namespace mlir::amdgpu;
 
-/// The size of a shared memory line according to AMD documentation.
-/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf
-constexpr int64_t kSharedMemoryLineSizeBytes = 64;
-/// We optimize for 64bit accesses, but this can be made an argument in the
-/// future.
-constexpr int64_t kDefaultVectorSizeBits = 64;
-
 /// Uses `srcIndexValue` to permute `tgtIndexValue` via
 ///   `result = xor(floordiv(srcIdxVal,permuteEveryN),
 ///                 floordiv(tgtIdxVal,vectorSize)))
 /// This is done using an optimized sequence of `arith` operations.
static Value permuteVectorOffset(OpBuilder &b, Location loc,
                                  ArrayRef<Value> indices, MemRefType memrefTy,
-                                 int64_t srcDim, int64_t tgtDim) {
+                                 int64_t srcDim, int64_t tgtDim,
+                                 int64_t sharedMemoryLineSizeBytes,
+                                 int64_t defaultVectorSizeBits) {
   // Adjust the src index to change how often the permutation changes
   // if necessary.
   Value src = indices[srcDim];
 
   // We only want to permute every N iterations of the target dim where N is
   // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
   const int64_t permuteEveryN = std::max<int64_t>(
-      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
-                                        memrefTy.getElementTypeBitWidth()) /
-                                       8));
+      1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
+                                       memrefTy.getElementTypeBitWidth()) /
+                                      8));
 
   // clang-format off
   // Index bit representation (b0 = least significant bit) for dim(1)
   // of a `memref` is as follows:
   // bits[N:M] = vector index
   // clang-format on
   int64_t n =
-      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
+      llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
   int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
 
   // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
@@ -105,9 +100,11 @@ static Value permuteVectorOffset(OpBuilder &b, Location loc,
 
 static void transformIndices(OpBuilder &builder, Location loc,
                              SmallVector<Value, 4> &indices,
                              MemRefType memrefTy, int64_t srcDim,
-                             int64_t tgtDim) {
+                             int64_t tgtDim, int64_t sharedMemoryLineSizeBytes,
+                             int64_t defaultVectorSizeBits) {
   indices[tgtDim] =
-      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
+      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim,
+                          sharedMemoryLineSizeBytes, defaultVectorSizeBits);
 }
 
 // Return all operations within `parentOp` that read from or write to
@@ -149,8 +146,9 @@ getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
   return success();
 }
 
-LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
-                                                         Value memrefValue) {
+LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(
    Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes,
    int64_t defaultVectorSizeBits) {
   auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
   if (!memRefType ||
       !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
@@ -167,10 +165,10 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
   // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
   const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
   const int64_t rowsPerLine =
-      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
+      (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
       rowSize;
   const int64_t threadGroupSize =
-      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
+      1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8));
   if (rowsPerLine >= threadGroupSize)
     return failure();
 
@@ -198,7 +196,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmWriteOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmWriteOp, transformedIndices);
   }
 
@@ -210,7 +209,8 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
     auto indices = amdgpu::getIndices(shmReadOp);
     SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
     transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim);
+                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
+                     defaultVectorSizeBits);
     amdgpu::setIndices(shmReadOp, transformedIndices);
   }
 
@@ -218,7 +218,9 @@ LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
 }
 
 std::optional<LogicalResult>
-amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp) {
+amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
+                                             int64_t sharedMemoryLineSizeBytes,
+                                             int64_t defaultVectorSizeBits) {
   SmallVector<memref::AllocOp> shmAllocOps;
   funcOp.walk([&](memref::AllocOp allocOp) {
     if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
       return;
     shmAllocOps.push_back(allocOp);
   });
   for (auto allocOp : shmAllocOps) {
-    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
-                                                          allocOp.getMemref())))
+    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(
+            funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes,
+            defaultVectorSizeBits)))
       return failure();
   }
   return success();
@@ -237,7 +240,8 @@ struct OptimizeSharedMemoryPass
     : public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
 public:
   OptimizeSharedMemoryPass() = default;
-
+  OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options)
+      : OptimizeSharedMemoryBase(options) {}
   void runOnOperation() override {
     Operation *op = getOperation();
     SmallVector<memref::AllocOp> shmAllocOps;
@@ -248,8 +252,9 @@ struct OptimizeSharedMemoryPass
       shmAllocOps.push_back(allocOp);
     });
     for (auto allocOp : shmAllocOps) {
-      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
-                                                    allocOp.getMemref())))
+      if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(),
+                                                    sharedMemoryLineSizeBytes,
+                                                    defaultVectorSizeBits)))
         return;
     }
   }

diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
index a1de1ff87c229fb..983eee732e2afe1 100644
--- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
+++ b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
@@ -1,13 +1,13 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
+// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
 
 // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>,
[[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) - func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, + func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, %readRow: index, %readCol: index, %writeRow: index, %writeCol: index, - %fragRow: index, %fragCol: index, + %fragRow: index, %fragCol: index, %fragColPerm: index, %stRow: index, %stCol: index) { - // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 + // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 %cst = arith.constant 0.000000e+00 : f16 // CHECK: [[shmA:%.+]] = memref.alloc @@ -15,42 +15,36 @@ %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3> %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> - // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16> %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> - // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> + // 
CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16> %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> return } diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir index 143e7c2d2709522..b1bb91ffc297214 100644 --- a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir +++ b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir @@ -1,10 +1,10 @@ -// RUN: mlir-opt %s -transform-interpreter | FileCheck %s +// RUN: mlir-opt %s -transform-interpreter | FileCheck %s // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) - func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, + func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, %readRow: index, %readCol: index, %writeRow: index, %writeCol: index, - %fragRow: index, %fragCol: index, + %fragRow: index, %fragCol: index, %fragColPerm: index, %stRow: index, %stCol: index) { %cst = arith.constant 0.000000e+00 : f16 @@ -13,33 +13,33 @@ %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : 
index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> gpu.barrier gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[c6:%.+]] = arith.constant 6 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> return @@ -48,7 +48,7 @@ module attributes { transform.with_named_sequence } { transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op - transform.amdgpu.optimize_shared_memory_reads_and_writes %0 : (!transform.any_op) -> () + transform.amdgpu.optimize_shared_memory_reads_and_writes %0 {sharedMemoryLineSizeBytes = 128, defaultVectorSizeBits = 128}: (!transform.any_op) -> () transform.yield } // @__transform_main } // module From c2b952926fe8707527cf1b8bab211dc4c7ab9aee Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Wed, 28 Feb 2024 00:11:28 -0500 Subject: [PATCH 033/114] [mlir][vector] Fix n-d transfer write distribution (#83215) Currently n-d transfer write distribution can be inconsistent with distribution of reductions if a value has multiple users, one of which is a transfer_write with a non-standard distribution map, and the other of which is a vector.reduction. We may want to consider removing the distribution map functionality in the future for this reason. 
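As a concrete illustration (taken from the `warp_propagate_nd_write` test
added below), distributing a 2-D `vector.transfer_write` over 32 lanes with an
identity distribution map now works: the lane id is delinearized over the
per-dimension thread counts (4 and 8 here), and each lane ends up writing a
`vector<1x128xf32>` slice of the `vector<4x1024xf32>` value:

```mlir
func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
  %c0 = arith.constant 0 : index
  vector.warp_execute_on_lane_0(%laneid)[32] -> () {
    %0 = "some_def"() : () -> (vector<4x1024xf32>)
    vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
    vector.yield
  }
  return
}
```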
---
 .../Vector/Transforms/VectorDistribute.cpp    | 44 ++++++++++++++++---
 .../Vector/vector-warp-distribute.mlir        | 25 +++++++++++
 .../Dialect/Vector/TestVectorTransforms.cpp   |  6 +--
 3 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 620ceee48b196df..b3ab4a916121e3a 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -443,15 +443,24 @@ static vector::TransferWriteOp cloneWriteOp(RewriterBase &rewriter,
 /// d1) and return vector<16x2x64>
 static VectorType getDistributedType(VectorType originalType, AffineMap map,
                                      int64_t warpSize) {
-  if (map.getNumResults() != 1)
-    return VectorType();
   SmallVector<int64_t> targetShape(originalType.getShape().begin(),
                                    originalType.getShape().end());
   for (unsigned i = 0, e = map.getNumResults(); i < e; i++) {
     unsigned position = map.getDimPosition(i);
-    if (targetShape[position] % warpSize != 0)
-      return VectorType();
+    if (targetShape[position] % warpSize != 0) {
+      if (warpSize % targetShape[position] != 0) {
+        return VectorType();
+      }
+      warpSize /= targetShape[position];
+      targetShape[position] = 1;
+      continue;
+    }
     targetShape[position] = targetShape[position] / warpSize;
+    warpSize = 1;
+    break;
+  }
+  if (warpSize != 1) {
+    return VectorType();
   }
   VectorType targetType =
       VectorType::get(targetShape, originalType.getElementType());
@@ -526,7 +535,30 @@ struct WarpOpTransferWrite : public OpRewritePattern<vector::TransferWriteOp> {
 
     // 4. Reindex the write using the distribution map.
     auto newWarpOp =
        newWriteOp.getVector().getDefiningOp<WarpExecuteOnLane0Op>();
+
+    // Delinearize the lane id based on the way threads are divided across the
+    // vector. To get the number of threads per vector dimension, divide the
+    // sequential size by the distributed size along each dim.
     rewriter.setInsertionPoint(newWriteOp);
+    SmallVector<OpFoldResult> delinearizedIdSizes;
+    for (auto [seqSize, distSize] :
+         llvm::zip_equal(writtenVectorType.getShape(), targetType.getShape())) {
+      assert(seqSize % distSize == 0 && "Invalid distributed vector shape");
+      delinearizedIdSizes.push_back(rewriter.getIndexAttr(seqSize / distSize));
+    }
+    SmallVector<Value> delinearized;
+    if (map.getNumResults() > 1) {
+      delinearized = rewriter
+                         .create<mlir::affine::AffineDelinearizeIndexOp>(
+                             newWarpOp.getLoc(), newWarpOp.getLaneid(),
+                             delinearizedIdSizes)
+                         .getResults();
+    } else {
+      // If there is only one map result, we can elide the delinearization
+      // op and use the lane id directly.
+      delinearized.append(targetType.getRank(), newWarpOp.getLaneid());
+    }
+
     AffineMap indexMap = map.compose(newWriteOp.getPermutationMap());
     Location loc = newWriteOp.getLoc();
     SmallVector<Value> indices(newWriteOp.getIndices().begin(),
@@ -539,11 +571,11 @@ struct WarpOpTransferWrite : public OpRewritePattern<vector::TransferWriteOp> {
         continue;
       unsigned indexPos = indexExpr.getPosition();
       unsigned vectorPos = cast<AffineDimExpr>(std::get<1>(it)).getPosition();
+      Value laneId = delinearized[vectorPos];
       auto scale =
          rewriter.getAffineConstantExpr(targetType.getDimSize(vectorPos));
       indices[indexPos] = affine::makeComposedAffineApply(
-          rewriter, loc, d0 + scale * d1,
-          {indices[indexPos], newWarpOp.getLaneid()});
+          rewriter, loc, d0 + scale * d1, {indices[indexPos], laneId});
     }
     newWriteOp.getIndicesMutable().assign(indices);

diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 9072603734879ea..bf90c4a6ebb3c21 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -1559,3 +1559,28 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1:
 // CHECK-PROP:   %[[DISTM0:.+]] = affine.apply #[[$SUBM0]]()[%[[M0]], %[[LANEID]]]
 // CHECK-PROP:   %[[DISTM1:.+]] = affine.apply #[[$SUBM1]]()[%[[M1]], %[[LANEID]]]
 // CHECK-PROP:   vector.create_mask %[[DISTM0]], %[[DISTM1]], %[[M2]] : vector<1x2x4xi1>
+
+// -----
+
+func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
+  %c0 = arith.constant 0 : index
+  vector.warp_execute_on_lane_0(%laneid)[32] -> () {
+    %0 = "some_def"() : () -> (vector<4x1024xf32>)
+    vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
+    vector.yield
+  }
+  return
+}
+
+// CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
+
+// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write(
+// CHECK-DIST-AND-PROP:   %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
+// CHECK-DIST-AND-PROP:     %[[V0:.*]] = "some_def"
+// CHECK-DIST-AND-PROP:     vector.yield %[[V0]]
+// CHECK-DIST-AND-PROP-SAME:   vector<4x1024xf32>
+// CHECK-DIST-AND-PROP: }
+
+// CHECK-DIST-AND-PROP: %[[IDS:.+]]:2 = affine.delinearize_index %{{.*}} into (%c4, %c8) : index, index
+// CHECK-DIST-AND-PROP: %[[INNER_ID:.+]] = affine.apply #map()[%[[IDS]]#1]
+// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]], %{{.*}}[%[[IDS]]#0, %[[INNER_ID]]] {{.*}} : vector<1x128xf32>

diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 178a58e796b2469..915f713f7047be3 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -630,15 +630,13 @@ struct TestVectorDistribution
     });
     MLIRContext *ctx = &getContext();
     auto distributionFn = [](Value val) {
-      // Create a map (d0, d1) -> (d1) to distribute along the inner
-      // dimension. Once we support n-d distribution we can add more
-      // complex cases.
+      // Create an identity dim map of the same rank as the vector.
      VectorType vecType = dyn_cast<VectorType>(val.getType());
      int64_t vecRank = vecType ?
vecType.getRank() : 0; OpBuilder builder(val.getContext()); if (vecRank == 0) return AffineMap::get(val.getContext()); - return AffineMap::get(vecRank, 0, builder.getAffineDimExpr(vecRank - 1)); + return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext()); }; auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx, int64_t warpSz) { From 267beb10f2812107734a1cd2172b46e928af76b7 Mon Sep 17 00:00:00 2001 From: mlevesquedion Date: Tue, 27 Feb 2024 21:31:15 -0800 Subject: [PATCH 034/114] [MLIR] Fix a few links to passes in the documentation (#83221) I double checked the links by building [the website](https://github.com/llvm/mlir-www): ``` $ mlir-www-helper.sh --install-docs ../llvm-project website $ cd website && hugo serve ``` --- mlir/docs/PassManagement.md | 3 +-- mlir/docs/PatternRewriter.md | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mlir/docs/PassManagement.md b/mlir/docs/PassManagement.md index ff86bbfef7b0bfe..c9d705f0506a3e7 100644 --- a/mlir/docs/PassManagement.md +++ b/mlir/docs/PassManagement.md @@ -56,8 +56,7 @@ By default, an operation pass is `op-agnostic`, meaning that it operates on the operation type of the pass manager that it is added to. This means a pass may operate on many different types of operations. Agnostic passes should be written such that they do not make assumptions on the operation they run on. Examples of this type of pass are -[Canonicalization](Pass.md/-canonicalize-canonicalize-operations) -[Common Sub-Expression Elimination](Passes.md/#-cse-eliminate-common-sub-expressions). +[Canonicalization](Passes.md/#-canonicalize) and [Common Sub-Expression Elimination](Passes.md/#-cse). To create an agnostic operation pass, a derived class must adhere to the following: diff --git a/mlir/docs/PatternRewriter.md b/mlir/docs/PatternRewriter.md index 011cd14175634be..0ba76199874cc3d 100644 --- a/mlir/docs/PatternRewriter.md +++ b/mlir/docs/PatternRewriter.md @@ -366,7 +366,7 @@ Note: This driver listens for IR changes via the callbacks provided by rewriter and do not bypass the rewriter API by modifying ops directly. Note: This driver is the one used by the [canonicalization](Canonicalization.md) -[pass](Passes.md/#-canonicalize-canonicalize-operations) in MLIR. +[pass](Passes.md/#-canonicalize) in MLIR. ### Debugging From 4d04a40adb68f284350831911a658715134c66d8 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Tue, 27 Feb 2024 22:25:55 -0800 Subject: [PATCH 035/114] [alpha.webkit.UncountedCallArgsChecker] Allow a variable declaration in a trivial function. 
 (#82291)

---
 .../Checkers/WebKit/PtrTypesSemantics.cpp |  8 ++++++--
 .../Checkers/WebKit/uncounted-obj-arg.cpp | 17 +++++++++++++++++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index defd83ec8e179c1..01b191ab0eeaf45 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -310,8 +310,12 @@ class TrivialFunctionAnalysisVisitor
         return true;
       if (isa<EnumConstantDecl>(decl))
         return true;
-      if (auto *VD = dyn_cast<VarDecl>(decl))
-        return VD->hasConstantInitialization() && VD->getEvaluatedValue();
+      if (auto *VD = dyn_cast<VarDecl>(decl)) {
+        if (VD->hasConstantInitialization() && VD->getEvaluatedValue())
+          return true;
+        auto *Init = VD->getInit();
+        return !Init || Visit(Init);
+      }
     }
     return false;
   }

diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index ac16a31293f3def..80a9a263dab1404 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -199,6 +199,8 @@ class RefCounted {
   bool trivial23() const { return OptionSet<Flags>::fromRaw(v).contains(Flags::Flag1); }
   int trivial24() const { ASSERT(v); return v; }
   unsigned trivial25() const { return __c11_atomic_load((volatile _Atomic(unsigned) *)&v, __ATOMIC_RELAXED); }
+  bool trivial26() { bool hasValue = v; return !hasValue; }
+  bool trivial27(int v) { bool value; value = v ? 1 : 0; return value; }
 
   static RefCounted& singleton() {
     static RefCounted s_RefCounted;
@@ -262,6 +264,15 @@ class RefCounted {
     return __c11_atomic_load((volatile _Atomic(unsigned) *)another(), __ATOMIC_RELAXED);
   }
 
+  void nonTrivial11() {
+    Number num(0.3);
+  }
+
+  bool nonTrivial12() {
+    bool val = otherFunction();
+    return val;
+  }
+
   unsigned v { 0 };
   Number* number { nullptr };
   Enum enumValue { Enum::Value1 };
@@ -309,6 +320,8 @@ class UnrelatedClass {
     getFieldTrivial().trivial23(); // no-warning
     getFieldTrivial().trivial24(); // no-warning
     getFieldTrivial().trivial25(); // no-warning
+    getFieldTrivial().trivial26(); // no-warning
+    getFieldTrivial().trivial27(5); // no-warning
     RefCounted::singleton().trivial18(); // no-warning
     RefCounted::singleton().someFunction(); // no-warning
 
@@ -334,6 +347,10 @@ class UnrelatedClass {
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
     getFieldTrivial().nonTrivial10();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial11();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
+    getFieldTrivial().nonTrivial12();
+    // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
   }
 };

From 8eea478f57e79b6fad065d023355907bc2098206 Mon Sep 17 00:00:00 2001
From: Ryan Prichard
Date: Tue, 27 Feb 2024 22:34:07 -0800
Subject: [PATCH 036/114] [ItaniumDemangle] reject A-F in FP literals (#83061)

Sync this change to the copy of ItaniumDemangle.h in "llvm":
https://github.com/llvm/llvm-project/pull/82864

The Itanium C++ ABI specifies that FP literals are encoded using a lowercase
hexadecimal string. Previously, libc++abi allowed uppercase A-F characters
but decoded them by subtracting 'a' from them, producing negative digit
values. It is especially confusing to accept an 'E' digit because 'E' marks
the end of the FP literal.
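For illustration, a hedged sketch of the new digit check (the helper name
`isLowerHexDigit` is ours; the patch inlines the comparison). For example, a
literal payload such as `3f800000` (the IEEE-754 bit pattern of 1.0f) is
still accepted, while `3F800000` is now rejected instead of being mis-decoded:

```cpp
// Illustrative only: the predicate that replaces std::isxdigit. Unlike
// isxdigit, it rejects 'A'-'F' (and in particular 'E', which also serves as
// the literal terminator), so decoding can no longer subtract 'a' from an
// uppercase character and produce a negative digit value.
bool isLowerHexDigit(char C) {
  return (C >= '0' && C <= '9') || (C >= 'a' && C <= 'f');
}
```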
--- llvm/include/llvm/Demangle/ItaniumDemangle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Demangle/ItaniumDemangle.h b/llvm/include/llvm/Demangle/ItaniumDemangle.h index 04bc58d8f63e474..d33af157543fecf 100644 --- a/llvm/include/llvm/Demangle/ItaniumDemangle.h +++ b/llvm/include/llvm/Demangle/ItaniumDemangle.h @@ -5540,7 +5540,7 @@ Node *AbstractManglingParser::parseFloatingLiteral() { return nullptr; std::string_view Data(First, N); for (char C : Data) - if (!std::isxdigit(C)) + if (!(C >= '0' && C <= '9') && !(C >= 'a' && C <= 'f')) return nullptr; First += N; if (!consumeIf('E')) From 28c29fbec3057692a7985819d799a9e5d47eb2d1 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 14:27:54 +0800 Subject: [PATCH 037/114] [RISCV] Add exact VLEN RUNs for insert_subvector and concat_vector tests. NFC Also update the RUNs in the extract_subvector tests to be consistent. Using the term VLS/VLA here as it's more succinct than KNOWNVLEN/UNKNOWNVLEN. --- .../rvv/fixed-vectors-extract-subvector.ll | 860 +++++++++--------- .../rvv/fixed-vectors-insert-subvector.ll | 559 +++++++++--- .../RISCV/rvv/fixed-vectors-shuffle-concat.ll | 265 ++++-- 3 files changed, 1012 insertions(+), 672 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll index c49b1a7ad1861db..b9c611bf3e54a8d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-subvector.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,CHECK-KNOWNVLEN128 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA + +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA + +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s define void @extract_v2i8_v4i8_0(ptr %x, ptr %y) { ; CHECK-LABEL: extract_v2i8_v4i8_0: @@ -63,22 +69,22 @@ define void @extract_v2i8_v8i8_6(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v1i32_v8i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v1i32_v8i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLA-NEXT: 
vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v1i32_v8i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 4) store <1 x i32> %c, ptr %y @@ -86,24 +92,24 @@ define void @extract_v1i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v1i32_v8i32_5: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 5 -; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v1i32_v8i32_5: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 5 +; VLA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v1i32_v8i32_5: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 1 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v1i32_v8i32_5: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 1 +; VLS-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <1 x i32> @llvm.vector.extract.v1i32.v8i32(<8 x i32> %a, i64 5) store <1 x i32> %c, ptr %y @@ -111,20 +117,20 @@ define void @extract_v1i32_v8i32_5(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 0) store <2 x i32> %c, ptr %y @@ -132,24 +138,24 @@ define void @extract_v2i32_v8i32_0(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_2: -; 
CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 2) store <2 x i32> %c, ptr %y @@ -157,22 +163,22 @@ define void @extract_v2i32_v8i32_2(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 4) store <2 x i32> %c, ptr %y @@ -180,24 +186,24 @@ define void @extract_v2i32_v8i32_4(ptr %x, ptr %y) { } define void @extract_v2i32_v8i32_6(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_v8i32_6: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vle32.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_v8i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v8, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a1) +; VLA-NEXT: ret ; -; 
CHECK-KNOWNVLEN128-LABEL: extract_v2i32_v8i32_6: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vl2re32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_v8i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a1) +; VLS-NEXT: ret %a = load <8 x i32>, ptr %x %c = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> %a, i64 6) store <2 x i32> %c, ptr %y @@ -230,59 +236,59 @@ define void @extract_v2i32_nxv16i32_2( %x, ptr %y) { } define void @extract_v2i32_nxv16i32_4( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_4: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 4 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_4: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 4 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_4: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v9, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_4: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v9, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 4) store <2 x i32> %c, ptr %y ret void } define void @extract_v2i32_nxv16i32_6( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_6: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_6: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 2 +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 6) store <2 x i32> %c, ptr %y ret void } define void @extract_v2i32_nxv16i32_8( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i32_nxv16i32_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 2, e32, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i32_nxv16i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, m4, ta, ma +; 
VLA-NEXT: vslidedown.vi v8, v8, 8 +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i32_nxv16i32_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vse32.v v10, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i32_nxv16i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vse32.v v10, (a0) +; VLS-NEXT: ret %c = call <2 x i32> @llvm.vector.extract.v2i32.nxv16i32( %x, i64 8) store <2 x i32> %c, ptr %y ret void @@ -339,40 +345,40 @@ define void @extract_v2i8_nxv2i8_6( %x, ptr %y) { } define void @extract_v8i32_nxv16i32_8( %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i32_nxv16i32_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetivli zero, 8, e32, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 8 -; CHECK-V-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-V-NEXT: vse32.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i32_nxv16i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m4, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 8 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i32_nxv16i32_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vs2r.v v10, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i32_nxv16i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %c = call <8 x i32> @llvm.vector.extract.v8i32.nxv16i32( %x, i64 8) store <8 x i32> %c, ptr %y ret void } define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 0) store <8 x i1> %c, ptr %y @@ -380,26 +386,26 @@ define void @extract_v8i1_v64i1_0(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_8: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_8: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; 
VLA-NEXT: vslidedown.vi v8, v8, 1 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_8: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 1 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_8: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 1 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 8) store <8 x i1> %c, ptr %y @@ -407,26 +413,26 @@ define void @extract_v8i1_v64i1_8(ptr %x, ptr %y) { } define void @extract_v8i1_v64i1_48(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v8i1_v64i1_48: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v8, (a0) -; CHECK-V-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v8i1_v64i1_48: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 6 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v8i1_v64i1_48: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 1, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 6 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v8i1_v64i1_48: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 1, e8, mf2, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 6 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <8 x i1> @llvm.vector.extract.v8i1.v64i1(<64 x i1> %a, i64 48) store <8 x i1> %c, ptr %y @@ -508,38 +514,38 @@ define void @extract_v8i1_nxv64i1_192( %x, ptr %y) { } define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_0: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: 
vlm.v v0, (a0) +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_0: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0) -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 0) store <2 x i1> %c, ptr %y @@ -547,48 +553,48 @@ define void @extract_v2i1_v64i1_0(ptr %x, ptr %y) { } define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_2: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_2: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, 
m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v v0, (a0) -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 2) store <2 x i1> %c, ptr %y @@ -596,49 +602,49 @@ define void @extract_v2i1_v64i1_2(ptr %x, ptr %y) { } define void @extract_v2i1_v64i1_42(ptr %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_v64i1_42: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: li a2, 64 -; CHECK-V-NEXT: vsetvli zero, a2, e8, m4, ta, ma -; CHECK-V-NEXT: vlm.v v0, (a0) -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: li a0, 42 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vx v8, v8, a0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a1) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_v64i1_42: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 64 +; VLA-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; VLA-NEXT: vlm.v v0, (a0) +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: li a0, 42 +; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma +; VLA-NEXT: vslidedown.vx v8, v8, a0 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a1) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_v64i1_42: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a2, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vlm.v 
v0, (a0) -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a1) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_v64i1_42: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m4, ta, ma +; VLS-NEXT: vlm.v v0, (a0) +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a1) +; VLS-NEXT: ret %a = load <64 x i1>, ptr %x %c = call <2 x i1> @llvm.vector.extract.v2i1.v64i1(<64 x i1> %a, i64 42) store <2 x i1> %c, ptr %y @@ -665,45 +671,45 @@ define void @extract_v2i1_nxv2i1_0( %x, ptr %y) { } define void @extract_v2i1_nxv2i1_2( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv2i1_2: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, mf4, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv2i1_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 2 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv2i1_2: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v8, 2 -; 
CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_nxv2i1_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; VLS-NEXT: vslidedown.vi v8, v8, 2 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv2i1( %x, i64 2) store <2 x i1> %c, ptr %y ret void @@ -754,91 +760,91 @@ define void @extract_v2i1_nxv64i1_2( %x, ptr %y) { } define void @extract_v2i1_nxv64i1_42( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv64i1_42: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: li a1, 42 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m4, ta, ma -; CHECK-V-NEXT: vslidedown.vx v8, v8, a1 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv64i1_42: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: li a1, 42 +; VLA-NEXT: vsetivli zero, 2, e8, m4, ta, ma +; VLA-NEXT: vslidedown.vx v8, v8, a1 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv64i1_42: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v10, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 
8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: extract_v2i1_nxv64i1_42: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v10, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv64i1( %x, i64 42) store <2 x i1> %c, ptr %y ret void } define void @extract_v2i1_nxv32i1_26( %x, ptr %y) { -; CHECK-V-LABEL: extract_v2i1_nxv32i1_26: -; CHECK-V: # %bb.0: -; CHECK-V-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, m2, ta, ma -; CHECK-V-NEXT: vslidedown.vi v8, v8, 26 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-V-NEXT: vmsne.vi v0, v8, 0 -; CHECK-V-NEXT: vmv.v.i v8, 0 -; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmv.v.i v9, 0 -; CHECK-V-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-V-NEXT: vmv.v.v v9, v8 -; CHECK-V-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-V-NEXT: vmsne.vi v8, v9, 0 -; CHECK-V-NEXT: vsm.v v8, (a0) -; CHECK-V-NEXT: ret +; VLA-LABEL: extract_v2i1_nxv32i1_26: +; VLA: # %bb.0: +; VLA-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 2, e8, m2, ta, ma +; VLA-NEXT: vslidedown.vi v8, v8, 26 +; VLA-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLA-NEXT: vmsne.vi v0, v8, 0 +; VLA-NEXT: vmv.v.i v8, 0 +; VLA-NEXT: vmerge.vim v8, v8, 1, v0 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vmsne.vi v8, v9, 0 +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret ; -; CHECK-KNOWNVLEN128-LABEL: extract_v2i1_nxv32i1_26: -; CHECK-KNOWNVLEN128: # %bb.0: -; CHECK-KNOWNVLEN128-NEXT: vsetvli a1, zero, e8, m4, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, m1, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vslidedown.vi v8, v9, 10 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v0, v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v8, 0 -; CHECK-KNOWNVLEN128-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.i v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 2, e8, mf2, tu, ma -; CHECK-KNOWNVLEN128-NEXT: vmv.v.v v9, v8 -; CHECK-KNOWNVLEN128-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-KNOWNVLEN128-NEXT: vmsne.vi v8, v9, 0 -; CHECK-KNOWNVLEN128-NEXT: vsm.v v8, (a0) -; CHECK-KNOWNVLEN128-NEXT: ret +; VLS-LABEL: 
extract_v2i1_nxv32i1_26: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 2, e8, m1, ta, ma +; VLS-NEXT: vslidedown.vi v8, v9, 10 +; VLS-NEXT: vsetivli zero, 2, e8, mf8, ta, ma +; VLS-NEXT: vmsne.vi v0, v8, 0 +; VLS-NEXT: vmv.v.i v8, 0 +; VLS-NEXT: vmerge.vim v8, v8, 1, v0 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vsetivli zero, 2, e8, mf2, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vmsne.vi v8, v9, 0 +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %c = call <2 x i1> @llvm.vector.extract.v2i1.nxv32i1( %x, i64 26) store <2 x i1> %c, ptr %y ret void diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll index efb1f720f2d096d..9f0240c53b219a7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll @@ -1,9 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA -; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 -; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 +; RUN: llc -mtriple=riscv32 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA +; RUN: llc -mtriple=riscv64 -mattr=+m,+v -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA + +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s define @insert_nxv8i32_v2i32_0( %vec, ptr %svp) { ; CHECK-LABEL: insert_nxv8i32_v2i32_0: @@ -45,26 +48,40 @@ define @insert_nxv8i32_v2i32_6( %vec, ptr % } define @insert_nxv8i32_v8i32_0( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v8i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 8, e32, m4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v12 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v8i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLA-NEXT: vmv.v.v v8, v12 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v8i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 8, e32, m4, tu, ma +; VLS-NEXT: vmv.v.v v8, v12 +; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 0) ret %v } define @insert_nxv8i32_v8i32_8( %vec, ptr %svp) { -; CHECK-LABEL: insert_nxv8i32_v8i32_8: -; CHECK: # %bb.0: 
-; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vsetivli zero, 16, e32, m4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v12, 8 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv8i32_v8i32_8: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v12, (a0) +; VLA-NEXT: vsetivli zero, 16, e32, m4, tu, ma +; VLA-NEXT: vslideup.vi v8, v12, 8 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv8i32_v8i32_8: +; VLS: # %bb.0: +; VLS-NEXT: vl2re32.v v12, (a0) +; VLS-NEXT: vsetivli zero, 16, e32, m4, tu, ma +; VLS-NEXT: vslideup.vi v8, v12, 8 +; VLS-NEXT: ret %sv = load <8 x i32>, ptr %svp %v = call @llvm.vector.insert.v8i32.nxv8i32( %vec, <8 x i32> %sv, i64 8) ret %v @@ -82,17 +99,27 @@ define @insert_nxv8i32_undef_v2i32_0(ptr %svp) { } define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vle32.v v9, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vse32.v v9, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl1re32.v v9, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; VLS-NEXT: vmv.v.v v9, v8 +; VLS-NEXT: vs1r.v v9, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <4 x i32>, ptr %vp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0) @@ -101,15 +128,25 @@ define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) { } define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v9, (a0) -; CHECK-NEXT: vslideup.vi v9, v8, 2 -; CHECK-NEXT: vse32.v v9, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vle32.v v9, (a0) +; VLA-NEXT: vslideup.vi v9, v8, 2 +; VLA-NEXT: vse32.v v9, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl1re32.v v9, (a0) +; VLS-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLS-NEXT: vslideup.vi v9, v8, 2 +; VLS-NEXT: vs1r.v v9, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <4 x i32>, ptr %vp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2) @@ -118,13 +155,20 @@ define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) { } define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v4i32_undef_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli 
zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vse32.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v4i32_undef_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; VLA-NEXT: vse32.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v4i32_undef_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vs1r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0) store <4 x i32> %v, ptr %vp @@ -132,17 +176,27 @@ define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) { } define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_v2i32_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, ma -; CHECK-NEXT: vmv.v.v v10, v8 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLA-NEXT: vmv.v.v v10, v8 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vl2re32.v v10, (a0) +; VLS-NEXT: vsetivli zero, 2, e32, m2, tu, ma +; VLS-NEXT: vmv.v.v v10, v8 +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0) @@ -151,17 +205,27 @@ define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) { } define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_v2i32_2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma -; CHECK-NEXT: vslideup.vi v10, v8, 2 -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; VLA-NEXT: vslideup.vi v10, v8, 2 +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_2: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vle32.v v10, (a1) +; VLS-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; VLS-NEXT: vslideup.vi v8, v10, 2 +; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2) @@ -170,15 +234,25 @@ define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) { } define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { -; CHECK-LABEL: 
insert_v8i32_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vslideup.vi v10, v8, 6 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vle32.v v10, (a0) +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vl2re32.v v8, (a0) +; VLS-NEXT: vle32.v v10, (a1) +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v8, v10, 6 +; VLS-NEXT: vs2r.v v8, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %vec = load <8 x i32>, ptr %vp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6) @@ -187,14 +261,23 @@ define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) { } define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v8i32_undef_v2i32_6: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vslideup.vi v10, v8, 6 -; CHECK-NEXT: vse32.v v10, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v8i32_undef_v2i32_6: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLA-NEXT: vle32.v v8, (a1) +; VLA-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLA-NEXT: vslideup.vi v10, v8, 6 +; VLA-NEXT: vse32.v v10, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v8i32_undef_v2i32_6: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; VLS-NEXT: vle32.v v8, (a1) +; VLS-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; VLS-NEXT: vslideup.vi v10, v8, 6 +; VLS-NEXT: vs2r.v v10, (a0) +; VLS-NEXT: ret %sv = load <2 x i32>, ptr %svp %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6) store <8 x i32> %v, ptr %vp @@ -239,18 +322,30 @@ define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) { } define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v32i1_v8i1_0: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 1, e8, mf4, tu, ma -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vsm.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v32i1_v8i1_0: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 32 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vlm.v v9, (a1) +; VLA-NEXT: vsetivli zero, 1, e8, mf4, tu, ma +; VLA-NEXT: vmv.v.v v8, v9 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v32i1_v8i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vlm.v v9, (a1) +; VLS-NEXT: vsetivli zero, 1, e8, mf4, tu, ma +; VLS-NEXT: vmv.v.v v8, v9 +; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %v = load <32 x i1>, ptr %vp %sv = load <8 x i1>, ptr %svp %c = call <32 x i1> 
@llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0) @@ -259,18 +354,30 @@ define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) { } define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) { -; CHECK-LABEL: insert_v32i1_v8i1_16: -; CHECK: # %bb.0: -; CHECK-NEXT: li a2, 32 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vlm.v v9, (a1) -; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, ma -; CHECK-NEXT: vslideup.vi v8, v9, 2 -; CHECK-NEXT: vsetvli zero, a2, e8, m2, ta, ma -; CHECK-NEXT: vsm.v v8, (a0) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v32i1_v8i1_16: +; VLA: # %bb.0: +; VLA-NEXT: li a2, 32 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLA-NEXT: vlm.v v9, (a1) +; VLA-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; VLA-NEXT: vslideup.vi v8, v9, 2 +; VLA-NEXT: vsetvli zero, a2, e8, m2, ta, ma +; VLA-NEXT: vsm.v v8, (a0) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v32i1_v8i1_16: +; VLS: # %bb.0: +; VLS-NEXT: vsetvli a2, zero, e8, m2, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; VLS-NEXT: vlm.v v9, (a1) +; VLS-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; VLS-NEXT: vslideup.vi v8, v9, 2 +; VLS-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; VLS-NEXT: vsm.v v8, (a0) +; VLS-NEXT: ret %v = load <32 x i1>, ptr %vp %sv = load <8 x i1>, ptr %svp %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16) @@ -358,22 +465,36 @@ define @insert_nxv2i16_v2i16_2( %v, ptr %sv } define @insert_nxv2i1_v4i1_0( %v, ptr %svp) { -; CHECK-LABEL: insert_nxv2i1_v4i1_0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vlm.v v8, (a0) -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v9, 0 -; CHECK-NEXT: vmerge.vim v9, v9, 1, v0 -; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; CHECK-NEXT: vmv.v.i v10, 0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vmerge.vim v8, v10, 1, v0 -; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma -; CHECK-NEXT: vmv.v.v v9, v8 -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma -; CHECK-NEXT: vmsne.vi v0, v9, 0 -; CHECK-NEXT: ret +; VLA-LABEL: insert_nxv2i1_v4i1_0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLA-NEXT: vlm.v v8, (a0) +; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v9, 0 +; VLA-NEXT: vmerge.vim v9, v9, 1, v0 +; VLA-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLA-NEXT: vmv.v.i v10, 0 +; VLA-NEXT: vmv1r.v v0, v8 +; VLA-NEXT: vmerge.vim v8, v10, 1, v0 +; VLA-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; VLA-NEXT: vmv.v.v v9, v8 +; VLA-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; VLA-NEXT: vmsne.vi v0, v9, 0 +; VLA-NEXT: ret +; +; VLS-LABEL: insert_nxv2i1_v4i1_0: +; VLS: # %bb.0: +; VLS-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; VLS-NEXT: vlm.v v8, (a0) +; VLS-NEXT: vmv.v.i v9, 0 +; VLS-NEXT: vmerge.vim v10, v9, 1, v0 +; VLS-NEXT: vmv1r.v v0, v8 +; VLS-NEXT: vmerge.vim v8, v9, 1, v0 +; VLS-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; VLS-NEXT: vmv.v.v v10, v8 +; VLS-NEXT: vsetvli zero, zero, e8, mf4, ta, ma +; VLS-NEXT: vmsne.vi v0, v10, 0 +; VLS-NEXT: ret %sv = load <4 x i1>, ptr %svp %c = call @llvm.vector.insert.v4i1.nxv2i1( %v, <4 x i1> %sv, i64 0) ret %c @@ -408,15 +529,24 @@ define @insert_nxv8i1_v8i1_16( %v, ptr %svp) declare @llvm.vector.insert.v2i64.nxv16i64(, <2 x i64>, i64) define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { -; 
CHECK-LABEL: insert_v2i64_nxv16i64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: vsetivli zero, 6, e64, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v16, 4 -; CHECK-NEXT: vs8r.v v8, (a2) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vle64.v v16, (a1) +; VLA-NEXT: vsetivli zero, 6, e64, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v16, 4 +; VLA-NEXT: vs8r.v v8, (a2) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vl1re64.v v16, (a1) +; VLS-NEXT: vsetivli zero, 6, e64, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vs8r.v v8, (a2) +; VLS-NEXT: ret %sv0 = load <2 x i64>, ptr %psv0 %sv1 = load <2 x i64>, ptr %psv1 %v0 = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv0, i64 0) @@ -426,12 +556,18 @@ define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) { } define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) { -; CHECK-LABEL: insert_v2i64_nxv16i64_lo0: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vs8r.v v8, (a1) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64_lo0: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vs8r.v v8, (a1) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64_lo0: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vs8r.v v8, (a1) +; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 0) store %v, ptr %out @@ -439,14 +575,22 @@ define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) { } define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) { -; CHECK-LABEL: insert_v2i64_nxv16i64_lo2: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 4, e64, m8, ta, ma -; CHECK-NEXT: vslideup.vi v16, v8, 2 -; CHECK-NEXT: vs8r.v v16, (a1) -; CHECK-NEXT: ret +; VLA-LABEL: insert_v2i64_nxv16i64_lo2: +; VLA: # %bb.0: +; VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; VLA-NEXT: vle64.v v8, (a0) +; VLA-NEXT: vsetivli zero, 4, e64, m8, ta, ma +; VLA-NEXT: vslideup.vi v16, v8, 2 +; VLA-NEXT: vs8r.v v16, (a1) +; VLA-NEXT: ret +; +; VLS-LABEL: insert_v2i64_nxv16i64_lo2: +; VLS: # %bb.0: +; VLS-NEXT: vl1re64.v v8, (a0) +; VLS-NEXT: vsetivli zero, 4, e64, m8, ta, ma +; VLS-NEXT: vslideup.vi v16, v8, 2 +; VLS-NEXT: vs8r.v v16, (a1) +; VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 2) store %v, ptr %out @@ -521,6 +665,127 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) { ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; RV64-NEXT: addi sp, sp, 80 ; RV64-NEXT: ret +; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi: +; RV32VLA: # %bb.0: +; RV32VLA-NEXT: addi sp, sp, -80 +; RV32VLA-NEXT: .cfi_def_cfa_offset 80 +; RV32VLA-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32VLA-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32VLA-NEXT: .cfi_offset ra, -4 +; RV32VLA-NEXT: .cfi_offset s0, -8 +; RV32VLA-NEXT: addi s0, sp, 80 +; RV32VLA-NEXT: .cfi_def_cfa s0, 0 +; RV32VLA-NEXT: csrr a2, vlenb +; RV32VLA-NEXT: slli a2, a2, 4 +; RV32VLA-NEXT: sub sp, sp, a2 +; RV32VLA-NEXT: andi sp, sp, -64 +; RV32VLA-NEXT: vsetivli zero, 2, 
e64, m1, ta, ma +; RV32VLA-NEXT: vle64.v v8, (a0) +; RV32VLA-NEXT: addi a0, sp, 128 +; RV32VLA-NEXT: vse64.v v8, (a0) +; RV32VLA-NEXT: csrr a0, vlenb +; RV32VLA-NEXT: slli a0, a0, 3 +; RV32VLA-NEXT: addi a2, sp, 64 +; RV32VLA-NEXT: add a3, a2, a0 +; RV32VLA-NEXT: vl8re64.v v8, (a3) +; RV32VLA-NEXT: vl8re64.v v16, (a2) +; RV32VLA-NEXT: add a0, a1, a0 +; RV32VLA-NEXT: vs8r.v v8, (a0) +; RV32VLA-NEXT: vs8r.v v16, (a1) +; RV32VLA-NEXT: addi sp, s0, -80 +; RV32VLA-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32VLA-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32VLA-NEXT: addi sp, sp, 80 +; RV32VLA-NEXT: ret +; +; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi: +; RV64VLA: # %bb.0: +; RV64VLA-NEXT: addi sp, sp, -80 +; RV64VLA-NEXT: .cfi_def_cfa_offset 80 +; RV64VLA-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64VLA-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64VLA-NEXT: .cfi_offset ra, -8 +; RV64VLA-NEXT: .cfi_offset s0, -16 +; RV64VLA-NEXT: addi s0, sp, 80 +; RV64VLA-NEXT: .cfi_def_cfa s0, 0 +; RV64VLA-NEXT: csrr a2, vlenb +; RV64VLA-NEXT: slli a2, a2, 4 +; RV64VLA-NEXT: sub sp, sp, a2 +; RV64VLA-NEXT: andi sp, sp, -64 +; RV64VLA-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64VLA-NEXT: vle64.v v8, (a0) +; RV64VLA-NEXT: addi a0, sp, 128 +; RV64VLA-NEXT: vse64.v v8, (a0) +; RV64VLA-NEXT: csrr a0, vlenb +; RV64VLA-NEXT: slli a0, a0, 3 +; RV64VLA-NEXT: addi a2, sp, 64 +; RV64VLA-NEXT: add a3, a2, a0 +; RV64VLA-NEXT: vl8re64.v v8, (a3) +; RV64VLA-NEXT: vl8re64.v v16, (a2) +; RV64VLA-NEXT: add a0, a1, a0 +; RV64VLA-NEXT: vs8r.v v8, (a0) +; RV64VLA-NEXT: vs8r.v v16, (a1) +; RV64VLA-NEXT: addi sp, s0, -80 +; RV64VLA-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64VLA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64VLA-NEXT: addi sp, sp, 80 +; RV64VLA-NEXT: ret +; +; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi: +; RV32VLS: # %bb.0: +; RV32VLS-NEXT: addi sp, sp, -80 +; RV32VLS-NEXT: .cfi_def_cfa_offset 80 +; RV32VLS-NEXT: sw ra, 76(sp) # 4-byte Folded Spill +; RV32VLS-NEXT: sw s0, 72(sp) # 4-byte Folded Spill +; RV32VLS-NEXT: .cfi_offset ra, -4 +; RV32VLS-NEXT: .cfi_offset s0, -8 +; RV32VLS-NEXT: addi s0, sp, 80 +; RV32VLS-NEXT: .cfi_def_cfa s0, 0 +; RV32VLS-NEXT: addi sp, sp, -256 +; RV32VLS-NEXT: andi sp, sp, -64 +; RV32VLS-NEXT: vl1re64.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 128 +; RV32VLS-NEXT: vs1r.v v8, (a0) +; RV32VLS-NEXT: addi a0, sp, 64 +; RV32VLS-NEXT: addi a2, sp, 192 +; RV32VLS-NEXT: vl8re64.v v8, (a2) +; RV32VLS-NEXT: vl8re64.v v16, (a0) +; RV32VLS-NEXT: addi a0, a1, 128 +; RV32VLS-NEXT: vs8r.v v8, (a0) +; RV32VLS-NEXT: vs8r.v v16, (a1) +; RV32VLS-NEXT: addi sp, s0, -80 +; RV32VLS-NEXT: lw ra, 76(sp) # 4-byte Folded Reload +; RV32VLS-NEXT: lw s0, 72(sp) # 4-byte Folded Reload +; RV32VLS-NEXT: addi sp, sp, 80 +; RV32VLS-NEXT: ret +; +; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi: +; RV64VLS: # %bb.0: +; RV64VLS-NEXT: addi sp, sp, -80 +; RV64VLS-NEXT: .cfi_def_cfa_offset 80 +; RV64VLS-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; RV64VLS-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; RV64VLS-NEXT: .cfi_offset ra, -8 +; RV64VLS-NEXT: .cfi_offset s0, -16 +; RV64VLS-NEXT: addi s0, sp, 80 +; RV64VLS-NEXT: .cfi_def_cfa s0, 0 +; RV64VLS-NEXT: addi sp, sp, -256 +; RV64VLS-NEXT: andi sp, sp, -64 +; RV64VLS-NEXT: vl1re64.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 128 +; RV64VLS-NEXT: vs1r.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 192 +; RV64VLS-NEXT: vl8re64.v v8, (a0) +; RV64VLS-NEXT: addi a0, sp, 64 +; RV64VLS-NEXT: vl8re64.v v16, (a0) +; RV64VLS-NEXT: addi a0, a1, 128 +; RV64VLS-NEXT: vs8r.v v8, (a0) +; 
RV64VLS-NEXT: vs8r.v v16, (a1) +; RV64VLS-NEXT: addi sp, s0, -80 +; RV64VLS-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; RV64VLS-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; RV64VLS-NEXT: addi sp, sp, 80 +; RV64VLS-NEXT: ret %sv = load <2 x i64>, ptr %psv %v = call @llvm.vector.insert.v2i64.nxv16i64( undef, <2 x i64> %sv, i64 8) store %v, ptr %out diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll index 6ef5aa846d6d965..ce8827fe47536bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-concat.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLA %s + +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS %s define <8 x i32> @concat_2xv4i32(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: concat_2xv4i32: @@ -128,31 +131,51 @@ define <16 x i32> @concat_8xv2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x } define <32 x i32> @concat_2xv16i32(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: concat_2xv16i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv4r.v v16, v12 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: ret +; VLA-LABEL: concat_2xv16i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv4r.v v16, v12 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: vslideup.vi v8, v16, 16 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_2xv16i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv4r.v v16, v12 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: vslideup.vi v8, v16, 16 +; VLS-NEXT: ret %ab = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> ret <32 x i32> %ab } define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { -; CHECK-LABEL: concat_4xv8i32: -; CHECK: # %bb.0: -; CHECK-NEXT: vmv2r.v v16, v14 -; CHECK-NEXT: vmv2r.v v24, v12 -; CHECK-NEXT: vmv2r.v v0, v10 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v0, 8 -; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v24, 16 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vslideup.vi v8, v16, 24 -; CHECK-NEXT: ret +; VLA-LABEL: concat_4xv8i32: +; VLA: # %bb.0: +; VLA-NEXT: vmv2r.v v16, v14 +; VLA-NEXT: vmv2r.v v24, v12 +; VLA-NEXT: vmv2r.v v0, v10 +; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v0, 8 +; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v24, 16 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_4xv8i32: +; VLS: # %bb.0: +; VLS-NEXT: vmv2r.v v16, v14 +; VLS-NEXT: vmv2r.v v24, v12 +; VLS-NEXT: vmv2r.v v0, v10 +; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLS-NEXT: 
vslideup.vi v8, v0, 8 +; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v24, 16 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: ret %ab = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> %cd = shufflevector <8 x i32> %c, <8 x i32> %d, <16 x i32> %abcd = shufflevector <16 x i32> %ab, <16 x i32> %cd, <32 x i32> @@ -160,82 +183,128 @@ define <32 x i32> @concat_4xv8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x } define <32 x i32> @concat_8xv4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h) { -; CHECK-LABEL: concat_8xv4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: sub sp, sp, a0 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb -; CHECK-NEXT: vmv1r.v v16, v15 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v14 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v13 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v16, v12 -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vmv1r.v v0, v11 -; CHECK-NEXT: vmv1r.v v24, v10 -; CHECK-NEXT: vmv1r.v v16, v9 -; CHECK-NEXT: vsetivli zero, 8, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v16, 4 -; CHECK-NEXT: vsetivli zero, 12, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v24, 8 -; CHECK-NEXT: vsetivli zero, 16, e32, m8, tu, ma -; CHECK-NEXT: vslideup.vi v8, v0, 12 -; CHECK-NEXT: vsetivli zero, 20, e32, m8, tu, ma -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 16 -; CHECK-NEXT: vsetivli zero, 24, e32, m8, tu, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 20 -; CHECK-NEXT: vsetivli zero, 28, e32, m8, tu, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 24 -; CHECK-NEXT: li a0, 32 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a1, a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vslideup.vi v8, v16, 28 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 5 -; CHECK-NEXT: add sp, sp, a0 -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; VLA-LABEL: concat_8xv4i32: +; VLA: # %bb.0: +; VLA-NEXT: addi sp, sp, -16 +; 
VLA-NEXT: .cfi_def_cfa_offset 16 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 5 +; VLA-NEXT: sub sp, sp, a0 +; VLA-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; VLA-NEXT: vmv1r.v v16, v15 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: li a1, 0 +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a1, a1, a0 +; VLA-NEXT: slli a0, a0, 1 +; VLA-NEXT: add a0, a0, a1 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v14 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 4 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v13 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v16, v12 +; VLA-NEXT: addi a0, sp, 16 +; VLA-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLA-NEXT: vmv1r.v v0, v11 +; VLA-NEXT: vmv1r.v v24, v10 +; VLA-NEXT: vmv1r.v v16, v9 +; VLA-NEXT: vsetivli zero, 8, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v16, 4 +; VLA-NEXT: vsetivli zero, 12, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v24, 8 +; VLA-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLA-NEXT: vslideup.vi v8, v0, 12 +; VLA-NEXT: vsetivli zero, 20, e32, m8, tu, ma +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 16 +; VLA-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 20 +; VLA-NEXT: vsetivli zero, 28, e32, m8, tu, ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 4 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 24 +; VLA-NEXT: li a0, 32 +; VLA-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: li a1, 0 +; VLA-NEXT: slli a0, a0, 3 +; VLA-NEXT: add a1, a1, a0 +; VLA-NEXT: slli a0, a0, 1 +; VLA-NEXT: add a0, a0, a1 +; VLA-NEXT: add a0, sp, a0 +; VLA-NEXT: addi a0, a0, 16 +; VLA-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLA-NEXT: vslideup.vi v8, v16, 28 +; VLA-NEXT: csrr a0, vlenb +; VLA-NEXT: slli a0, a0, 5 +; VLA-NEXT: add sp, sp, a0 +; VLA-NEXT: addi sp, sp, 16 +; VLA-NEXT: ret +; +; VLS-LABEL: concat_8xv4i32: +; VLS: # %bb.0: +; VLS-NEXT: addi sp, sp, -16 +; VLS-NEXT: .cfi_def_cfa_offset 16 +; VLS-NEXT: addi sp, sp, -512 +; VLS-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; VLS-NEXT: vmv1r.v v16, v15 +; VLS-NEXT: addi a0, sp, 400 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v14 +; VLS-NEXT: addi a0, sp, 272 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v13 +; VLS-NEXT: addi a0, sp, 144 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v16, v12 +; VLS-NEXT: addi a0, sp, 16 +; VLS-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; VLS-NEXT: vmv1r.v v0, v11 +; VLS-NEXT: vmv1r.v v24, v10 +; VLS-NEXT: vmv1r.v v16, v9 +; VLS-NEXT: vsetivli zero, 8, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v16, 4 +; VLS-NEXT: vsetivli zero, 12, e32, 
m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v24, 8 +; VLS-NEXT: vsetivli zero, 16, e32, m8, tu, ma +; VLS-NEXT: vslideup.vi v8, v0, 12 +; VLS-NEXT: vsetivli zero, 20, e32, m8, tu, ma +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 16 +; VLS-NEXT: vsetivli zero, 24, e32, m8, tu, ma +; VLS-NEXT: addi a0, sp, 144 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 20 +; VLS-NEXT: vsetivli zero, 28, e32, m8, tu, ma +; VLS-NEXT: addi a0, sp, 272 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 24 +; VLS-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; VLS-NEXT: addi a0, sp, 400 +; VLS-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; VLS-NEXT: vslideup.vi v8, v16, 28 +; VLS-NEXT: addi sp, sp, 512 +; VLS-NEXT: addi sp, sp, 16 +; VLS-NEXT: ret %ab = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %cd = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %abcd = shufflevector <8 x i32> %ab, <8 x i32> %cd, <16 x i32> From f81d5e549f0e02e1bfc5ccaf6341ad35d4ea8e98 Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 28 Feb 2024 08:48:47 +0100 Subject: [PATCH 038/114] [flang] Handle OPTIONAL polymorphic captured in internal procedures (#82042) The current code was doing an unconditional `fir.store %optional_box to %host_link`, which caused a crash when %optional_box is absent because it is attempting to copy a descriptor from a null address. Add code to conditionally do the copy at runtime. The polymorphic array case with lower bounds can be handled with the array case that already deals with descriptor arguments, with a few modifications; just use that. --- flang/lib/Lower/HostAssociations.cpp | 64 +++++++++++---- flang/lib/Optimizer/Builder/MutableBox.cpp | 2 +- .../HLFIR/internal-procedures-polymorphic.f90 | 81 +++++++++++++++++++ 3 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index a62f7a7e99b6ffd..44cc0e74e3b52ab 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -247,9 +247,11 @@ class CapturedCharacterScalars } }; -/// Class defining how polymorphic entities are captured in internal procedures. -/// Polymorphic entities are always boxed as a fir.class box. -class CapturedPolymorphic : public CapturedSymbols { +/// Class defining how polymorphic scalar entities are captured in internal +/// procedures. Polymorphic entities are always boxed as a fir.class box. 
+/// Polymorphic arrays can be handled in CapturedArrays directly. +class CapturedPolymorphicScalar + : public CapturedSymbols { public: static mlir::Type getType(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { @@ -257,19 +259,50 @@ class CapturedPolymorphic : public CapturedSymbols { static void instantiateHostTuple(const InstantiateHostTuple &args, Fortran::lower::AbstractConverter &converter, - const Fortran::semantics::Symbol &) { + const Fortran::semantics::Symbol &sym) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = args.loc; mlir::Type typeInTuple = fir::dyn_cast_ptrEleTy(args.addrInTuple.getType()); assert(typeInTuple && "addrInTuple must be an address"); mlir::Value castBox = builder.createConvert(args.loc, typeInTuple, fir::getBase(args.hostValue)); - builder.create(args.loc, castBox, args.addrInTuple); + if (Fortran::semantics::IsOptional(sym)) { + auto isPresent = + builder.create(loc, builder.getI1Type(), castBox); + builder.genIfThenElse(loc, isPresent) + .genThen([&]() { + builder.create(loc, castBox, args.addrInTuple); + }) + .genElse([&]() { + mlir::Value null = fir::factory::createUnallocatedBox( + builder, loc, typeInTuple, + /*nonDeferredParams=*/mlir::ValueRange{}); + builder.create(loc, null, args.addrInTuple); + }) + .end(); + } else { + builder.create(loc, castBox, args.addrInTuple); + } } static void getFromTuple(const GetFromTuple &args, Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym, const Fortran::lower::BoxAnalyzer &ba) { - bindCapturedSymbol(sym, args.valueInTuple, converter, args.symMap); + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + mlir::Location loc = args.loc; + mlir::Value box = args.valueInTuple; + if (Fortran::semantics::IsOptional(sym)) { + auto boxTy = box.getType().cast(); + auto eleTy = boxTy.getEleTy(); + if (!fir::isa_ref_type(eleTy)) + eleTy = builder.getRefType(eleTy); + auto addr = builder.create(loc, eleTy, box); + mlir::Value isPresent = builder.genIsNotNullAddr(loc, addr); + auto absentBox = builder.create(loc, boxTy); + box = + builder.create(loc, isPresent, box, absentBox); + } + bindCapturedSymbol(sym, box, converter, args.symMap); } }; @@ -342,7 +375,12 @@ class CapturedArrays : public CapturedSymbols { static mlir::Type getType(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::Symbol &sym) { mlir::Type type = converter.genType(sym); - assert(type.isa() && "must be a sequence type"); + bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym); + assert((type.isa() || + (isPolymorphic && type.isa())) && + "must be a sequence type"); + if (isPolymorphic) + return type; return fir::BoxType::get(type); } @@ -410,13 +448,13 @@ class CapturedArrays : public CapturedSymbols { fir::factory::readBoxValue(builder, loc, boxValue), converter, args.symMap); } else { - // Keep variable as a fir.box. + // Keep variable as a fir.box/fir.class. // If this is an optional that is absent, the fir.box needs to be an // AbsentOp result, otherwise it will not work properly with IsPresentOp // (absent boxes are null descriptor addresses, not descriptors containing // a null base address). 
if (Fortran::semantics::IsOptional(sym)) { - auto boxTy = box.getType().cast(); + auto boxTy = box.getType().cast(); auto eleTy = boxTy.getEleTy(); if (!fir::isa_ref_type(eleTy)) eleTy = builder.getRefType(eleTy); @@ -470,14 +508,10 @@ walkCaptureCategories(T visitor, Fortran::lower::AbstractConverter &converter, ba.analyze(sym); if (Fortran::semantics::IsAllocatableOrPointer(sym)) return CapturedAllocatableAndPointer::visit(visitor, converter, sym, ba); - if (Fortran::semantics::IsPolymorphic(sym)) { - if (ba.isArray() && !ba.lboundIsAllOnes()) - TODO(converter.genLocation(sym.name()), - "polymorphic array with non default lower bound"); - return CapturedPolymorphic::visit(visitor, converter, sym, ba); - } if (ba.isArray()) return CapturedArrays::visit(visitor, converter, sym, ba); + if (Fortran::semantics::IsPolymorphic(sym)) + return CapturedPolymorphicScalar::visit(visitor, converter, sym, ba); if (ba.isChar()) return CapturedCharacterScalars::visit(visitor, converter, sym, ba); assert(ba.isTrivial() && "must be trivial scalar"); diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index 4d8860b60915c43..d4012e9c3d9d937 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -674,7 +674,7 @@ void fir::factory::disassociateMutableBox(fir::FirOpBuilder &builder, // 7.3.2.3 point 7. The dynamic type of a disassociated pointer is the // same as its declared type. auto boxTy = box.getBoxTy().dyn_cast(); - auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(boxTy.getEleTy()); + auto eleTy = fir::unwrapPassByRefType(boxTy.getEleTy()); mlir::Type derivedType = fir::getDerivedType(eleTy); if (auto recTy = derivedType.dyn_cast()) { fir::runtime::genNullifyDerivedType(builder, loc, box.getAddr(), recTy, diff --git a/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 new file mode 100644 index 000000000000000..8645488290d715b --- /dev/null +++ b/flang/test/Lower/HLFIR/internal-procedures-polymorphic.f90 @@ -0,0 +1,81 @@ +! Test lowering of internal procedure capturing OPTIONAL polymorphic +! objects. +! RUN: bbc -emit-hlfir --polymorphic-type -o - %s -I nw | FileCheck %s + + +module captured_optional_polymorphic + type sometype + end type +contains +subroutine test(x, y) + class(sometype), optional :: x + class(sometype), optional :: y(2:) + call internal() +contains + subroutine internal() + if (present(x).and.present(y)) then + print *, same_type_as(x, y) + end if + end subroutine +end +end module + +! CHECK-LABEL: func.func @_QMcaptured_optional_polymorphicPtest( +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare{{.*}}Ex +! CHECK: %[[VAL_3:.*]] = arith.constant 2 : i64 +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i64) -> index +! CHECK: %[[VAL_5:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1> +! CHECK: %[[VAL_6:.*]]:2 = hlfir.declare{{.*}}Ey +! CHECK: %[[VAL_7:.*]] = fir.alloca tuple>, !fir.class>>> +! CHECK: %[[VAL_8:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_8]] +! CHECK: %[[VAL_10:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class>) -> i1 +! CHECK: fir.if %[[VAL_10]] { +! CHECK: fir.store %[[VAL_2]]#1 to %[[VAL_9]] : !fir.ref>> +! CHECK: } else { +! CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.ref> +! CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_11]] : (!fir.ref>) -> !fir.class> +! CHECK: fir.store %[[VAL_12]] to %[[VAL_9]] : !fir.ref>> +! CHECK: } +! 
CHECK: %[[VAL_13:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_14:.*]] = fir.coordinate_of %[[VAL_7]], %[[VAL_13]] +! CHECK: %[[VAL_15:.*]] = fir.is_present %[[VAL_6]]#1 : (!fir.class>>) -> i1 +! CHECK: fir.if %[[VAL_15]] { +! CHECK: %[[VAL_16:.*]] = fir.shift %[[VAL_4]] : (index) -> !fir.shift<1> +! CHECK: %[[VAL_17:.*]] = fir.rebox %[[VAL_6]]#1(%[[VAL_16]]) : (!fir.class>>, !fir.shift<1>) -> !fir.class>> +! CHECK: fir.store %[[VAL_17]] to %[[VAL_14]] : !fir.ref>>> +! CHECK: } else { +! CHECK: %[[VAL_18:.*]] = fir.type_desc !fir.type<_QMcaptured_optional_polymorphicTsometype> +! CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_14]] : (!fir.ref>>>) -> !fir.ref> +! CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_18]] : (!fir.tdesc>) -> !fir.ref +! CHECK: %[[VAL_21:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_23:.*]] = fir.call @_FortranAPointerNullifyDerived(%[[VAL_19]], %[[VAL_20]], %[[VAL_21]], %[[VAL_22]]) fastmath : (!fir.ref>, !fir.ref, i32, i32) -> none +! CHECK: } +! CHECK: fir.call @_QMcaptured_optional_polymorphicFtestPinternal(%[[VAL_7]]) + +! CHECK-LABEL: func.func{{.*}} @_QMcaptured_optional_polymorphicFtestPinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> +! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class>) -> !fir.ref> +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ref>) -> i64 +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_8:.*]] = fir.absent !fir.class> +! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_7]], %[[VAL_3]], %[[VAL_8]] : !fir.class> +! CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_9]] {fortran_attrs = #fir.var_attrs, {{.*}}Ex +! CHECK: %[[VAL_11:.*]] = arith.constant 1 : i32 +! CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]] +! CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_12]] : !fir.ref>>> +! CHECK: %[[VAL_14:.*]] = arith.constant 0 : index +! CHECK: %[[VAL_15:.*]]:3 = fir.box_dims %[[VAL_13]], %[[VAL_14]] +! CHECK: %[[VAL_16:.*]] = fir.box_addr %[[VAL_13]] +! CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (!fir.ref>>) -> i64 +! CHECK: %[[VAL_18:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_17]], %[[VAL_18]] : i64 +! CHECK: %[[VAL_20:.*]] = fir.absent !fir.class>> +! CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_13]], %[[VAL_20]] : !fir.class>> +! CHECK: %[[VAL_22:.*]] = fir.shift %[[VAL_15]]#0 : (index) -> !fir.shift<1> +! CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_21]](%[[VAL_22]]) {fortran_attrs = #fir.var_attrs, {{.*}}Ey From 9617da88ab961145047076c45bb2bb1ac4513634 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 28 Feb 2024 15:58:55 +0800 Subject: [PATCH 039/114] [RISCV] Use a ta vslideup if inserting over end of InterSubVT (#83230) The description in #83146 is slightly inaccurate: it relaxes a tail undisturbed vslideup to tail agnostic if we are inserting over the entire tail of the vector **and** we didn't shrink the LMUL of the vector being inserted into. This handles the case where we did shrink down the LMUL via InterSubVT by checking if we inserted over the entire tail of InterSubVT, the actual type that we're performing the vslideup on, not VecVT. 
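To make the policy decision concrete before the diff, here is a small self-contained sketch; the enum and the plain element counts are illustrative stand-ins for the RISCVII policy constants and MVT element counts, not the actual backend code:

```c++
#include <cassert>

// Toy model of the tail-policy choice described above. EndIndex is one past
// the last element written by the insert; InterSubVTElts is the element count
// of the container type the vslideup actually operates on (which may have a
// smaller LMUL than the original vector type).
enum class TailPolicy { Undisturbed, Agnostic };

TailPolicy selectTailPolicy(unsigned EndIndex, unsigned InterSubVTElts) {
  // Safe default: elements past the inserted region keep their old values.
  TailPolicy Policy = TailPolicy::Undisturbed;
  // Relaxing to tail agnostic is only legal when the insert covers the whole
  // tail of InterSubVT; comparing against the original vector's element count
  // (the pre-fix behavior) would be wrong once the LMUL was shrunk.
  if (EndIndex == InterSubVTElts)
    Policy = TailPolicy::Agnostic;
  return Policy;
}

int main() {
  // Insert reaching the end of an 8-element container: agnostic is legal.
  assert(selectTailPolicy(8, 8) == TailPolicy::Agnostic);
  // Insert ending mid-container: the tail must stay undisturbed.
  assert(selectTailPolicy(4, 8) == TailPolicy::Undisturbed);
  return 0;
}
```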
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll | 2 +- llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll | 10 +++++----- llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll | 12 ++++++------ 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e95e21bda687e8e..dde1882f5eea832 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9732,9 +9732,9 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, ElementCount::getScalable(RemIdx) + SubVecVT.getVectorElementCount(); VL = computeVLMax(SubVecVT, DL, DAG); - // Use tail agnostic policy if we're inserting over Vec's tail. + // Use tail agnostic policy if we're inserting over InterSubVT's tail. unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if (EndIndex == VecVT.getVectorElementCount()) + if (EndIndex == InterSubVT.getVectorElementCount()) Policy = RISCVII::TAIL_AGNOSTIC; // If we're inserting into the lowest elements, use a tail undisturbed diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll index a2d02b6bb641b28..76aa2b913c6525d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll @@ -474,7 +474,7 @@ define @extract_nxv6f16_nxv12f16_6( %in) ; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v12, v9, a0 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v12, v10, a0 ; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll index d377082761736fa..b15896580d4253a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll +++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll @@ -227,7 +227,7 @@ define @insert_nxv16i32_nxv1i32_1( %vec, ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 3 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv1i32.nxv16i32( %vec, %subvec, i64 1) @@ -306,7 +306,7 @@ define @insert_nxv16i8_nxv1i8_7( %vec, @llvm.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 7) @@ -319,7 +319,7 @@ define @insert_nxv16i8_nxv1i8_15( %vec, @llvm.vector.insert.nxv1i8.nxv16i8( %vec, %subvec, i64 15) @@ -344,7 +344,7 @@ define @insert_nxv32f16_nxv2f16_2( %vec ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v8, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 2) @@ -357,7 +357,7 @@ define @insert_nxv32f16_nxv2f16_26( %ve ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v14, v16, a0 ; CHECK-NEXT: ret %v = call @llvm.vector.insert.nxv2f16.nxv32f16( %vec, %subvec, i64 26) diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll index 
515d77109af9f71..6d42b15273cf866 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -916,7 +916,7 @@ define half @vreduce_ord_fadd_nxv6f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v9, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 @@ -938,11 +938,11 @@ define half @vreduce_ord_fadd_nxv10f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma ; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v11, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfmv.s.f v12, fa0 @@ -1002,7 +1002,7 @@ define half @vreduce_fadd_nxv6f16( %v, half %s) { ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v9, v10, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma ; CHECK-NEXT: vfmv.s.f v10, fa0 @@ -1025,11 +1025,11 @@ define half @vreduce_fmin_nxv10f16( %v) { ; CHECK-NEXT: vlse16.v v12, (a1), zero ; CHECK-NEXT: srli a0, a0, 2 ; CHECK-NEXT: add a1, a0, a0 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v10, v12, a0 ; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma ; CHECK-NEXT: vmv.v.v v11, v12 -; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma ; CHECK-NEXT: vslideup.vx v11, v12, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v8 From 2b545108ffcb188b69d1a5e37081494d231e1456 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 28 Feb 2024 08:32:32 +0000 Subject: [PATCH 040/114] [GlobalISel] Add a TargetLowering variable to IRTranslator. NFC (#83009) This prevents us from getting the variable multiple times. 
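As a minimal sketch of the refactoring pattern (the structs below are trivial stand-ins, not the real LLVM classes; only the shape of the change is taken from the patch):

```c++
// Stand-in types modeling MF->getSubtarget().getTargetLowering().
struct TargetLowering {
  bool isJumpExpensive() const { return false; }
};
struct Subtarget {
  TargetLowering TL;
  const TargetLowering *getTargetLowering() const { return &TL; }
};
struct MachineFunction {
  Subtarget ST;
  const Subtarget &getSubtarget() const { return ST; }
};

class IRTranslator {
  MachineFunction *MF = nullptr;
  const TargetLowering *TLI = nullptr; // cached once per function

public:
  bool runOnMachineFunction(MachineFunction &CurMF) {
    MF = &CurMF;
    // Derive the pointer once per function instead of at every use site.
    TLI = MF->getSubtarget().getTargetLowering();
    return translateBr();
  }

private:
  bool translateBr() {
    // Before the patch, every translate* method repeated:
    //   const auto &TLI = *MF->getSubtarget().getTargetLowering();
    return !TLI->isJumpExpensive();
  }
};

int main() {
  MachineFunction MF;
  IRTranslator Translator;
  return Translator.runOnMachineFunction(MF) ? 0 : 1;
}
```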
--- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 1 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 74 +++++++------------ 2 files changed, 28 insertions(+), 47 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 5454df02914af63..bfac54a65c5b4ea 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -612,6 +612,7 @@ class IRTranslator : public MachineFunctionPass { AAResults *AA = nullptr; AssumptionCache *AC = nullptr; const TargetLibraryInfo *LibInfo = nullptr; + const TargetLowering *TLI = nullptr; FunctionLoweringInfo FuncInfo; // True when either the Target Machine specifies no optimizations or the diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 38bb808dd5bd53a..7c986dbbc2c7c88 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -596,8 +596,6 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { const Value *CondVal = BrInst.getCondition(); MachineBasicBlock *Succ1MBB = &getMBB(*BrInst.getSuccessor(1)); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - // If this is a series of conditions that are or'd or and'd together, emit // this as a sequence of branches instead of setcc's with and/or operations. // As long as jumps are not expensive (exceptions for multi-use logic ops, @@ -617,7 +615,7 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) { // jle foo using namespace PatternMatch; const Instruction *CondI = dyn_cast(CondVal); - if (!TLI.isJumpExpensive() && CondI && CondI->hasOneUse() && + if (!TLI->isJumpExpensive() && CondI && CondI->hasOneUse() && !BrInst.hasMetadata(LLVMContext::MD_unpredictable)) { Instruction::BinaryOps Opcode = (Instruction::BinaryOps)0; Value *Vec; @@ -1385,9 +1383,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { return true; } - auto &TLI = *MF->getSubtarget().getTargetLowering(); MachineMemOperand::Flags Flags = - TLI.getLoadMemOperandFlags(LI, *DL, AC, LibInfo); + TLI->getLoadMemOperandFlags(LI, *DL, AC, LibInfo); if (AA && !(Flags & MachineMemOperand::MOInvariant)) { if (AA->pointsToConstantMemory( MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) { @@ -1434,8 +1431,7 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) { return true; } - auto &TLI = *MF->getSubtarget().getTargetLowering(); - MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL); + MachineMemOperand::Flags Flags = TLI->getStoreMemOperandFlags(SI, *DL); for (unsigned i = 0; i < Vals.size(); ++i) { Register Addr; @@ -1779,8 +1775,7 @@ void IRTranslator::getStackGuard(Register DstReg, auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {}); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent()); + Value *Global = TLI->getSDagStackGuard(*MF->getFunction().getParent()); if (!Global) return; @@ -2111,9 +2106,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, // does. Simplest intrinsic ever! 
return true; case Intrinsic::vastart: { - auto &TLI = *MF->getSubtarget().getTargetLowering(); Value *Ptr = CI.getArgOperand(0); - unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8; + unsigned ListSize = TLI->getVaListSizeInBits(*DL) / 8; Align Alignment = getKnownAlignment(Ptr, *DL); MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)}) @@ -2189,14 +2183,13 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateFixedPointIntrinsic(TargetOpcode::G_UDIVFIXSAT, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); Register Dst = getOrCreateVReg(CI); Register Op0 = getOrCreateVReg(*CI.getArgOperand(0)); Register Op1 = getOrCreateVReg(*CI.getArgOperand(1)); Register Op2 = getOrCreateVReg(*CI.getArgOperand(2)); if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict && - TLI.isFMAFasterThanFMulAndFAdd(*MF, - TLI.getValueType(*DL, CI.getType()))) { + TLI->isFMAFasterThanFMulAndFAdd(*MF, + TLI->getValueType(*DL, CI.getType()))) { // TODO: Revisit this to see if we should move this part of the // lowering to the combiner. MIRBuilder.buildFMA(Dst, Op0, Op1, Op2, @@ -2254,10 +2247,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, getStackGuard(getOrCreateVReg(CI), MIRBuilder); return true; case Intrinsic::stackprotector: { - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); Register GuardVal; - if (TLI.useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode()) { GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); } else @@ -2635,10 +2627,9 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { } // Add a MachineMemOperand if it is a target mem intrinsic. - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); TargetLowering::IntrinsicInfo Info; // TODO: Add a GlobalISel version of getTgtMemIntrinsic. - if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) { + if (TLI->getTgtMemIntrinsic(Info, CI, *MF, ID)) { Align Alignment = Info.align.value_or( DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext()))); LLT MemTy = Info.memVT.isSimple() @@ -2818,10 +2809,9 @@ bool IRTranslator::translateLandingPad(const User &U, // If there aren't registers to copy the values into (e.g., during SjLj // exceptions), then don't bother. - auto &TLI = *MF->getSubtarget().getTargetLowering(); const Constant *PersonalityFn = MF->getFunction().getPersonalityFn(); - if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 && - TLI.getExceptionSelectorRegister(PersonalityFn) == 0) + if (TLI->getExceptionPointerRegister(PersonalityFn) == 0 && + TLI->getExceptionSelectorRegister(PersonalityFn) == 0) return true; // If landingpad's return type is token type, we don't create DAG nodes @@ -2852,7 +2842,7 @@ bool IRTranslator::translateLandingPad(const User &U, assert(Tys.size() == 2 && "Only two-valued landingpads are supported"); // Mark exception register as live in. 
- Register ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn); + Register ExceptionReg = TLI->getExceptionPointerRegister(PersonalityFn); if (!ExceptionReg) return false; @@ -2860,7 +2850,7 @@ bool IRTranslator::translateLandingPad(const User &U, ArrayRef ResRegs = getOrCreateVRegs(LP); MIRBuilder.buildCopy(ResRegs[0], ExceptionReg); - Register SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn); + Register SelectorReg = TLI->getExceptionSelectorRegister(PersonalityFn); if (!SelectorReg) return false; @@ -2986,8 +2976,7 @@ bool IRTranslator::translateExtractElement(const User &U, Register Res = getOrCreateVReg(U); Register Val = getOrCreateVReg(*U.getOperand(0)); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - unsigned PreferredVecIdxWidth = TLI.getVectorIdxTy(*DL).getSizeInBits(); + unsigned PreferredVecIdxWidth = TLI->getVectorIdxTy(*DL).getSizeInBits(); Register Idx; if (auto *CI = dyn_cast(U.getOperand(1))) { if (CI->getBitWidth() != PreferredVecIdxWidth) { @@ -3039,8 +3028,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, MachineIRBuilder &MIRBuilder) { const AtomicCmpXchgInst &I = cast(U); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - auto Flags = TLI.getAtomicMemOperandFlags(I, *DL); + auto Flags = TLI->getAtomicMemOperandFlags(I, *DL); auto Res = getOrCreateVRegs(I); Register OldValRes = Res[0]; @@ -3061,8 +3049,7 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, bool IRTranslator::translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) { const AtomicRMWInst &I = cast(U); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - auto Flags = TLI.getAtomicMemOperandFlags(I, *DL); + auto Flags = TLI->getAtomicMemOperandFlags(I, *DL); Register Res = getOrCreateVReg(I); Register Addr = getOrCreateVReg(*I.getPointerOperand()); @@ -3302,8 +3289,7 @@ bool IRTranslator::translate(const Instruction &Inst) { CurBuilder->setDebugLoc(Inst.getDebugLoc()); CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections)); - auto &TLI = *MF->getSubtarget().getTargetLowering(); - if (TLI.fallBackToDAGISel(Inst)) + if (TLI->fallBackToDAGISel(Inst)) return false; switch (Inst.getOpcode()) { @@ -3454,9 +3440,8 @@ bool IRTranslator::finalizeBasicBlock(const BasicBlock &BB, // Check if we need to generate stack-protector guard checks. StackProtector &SP = getAnalysis(); if (SP.shouldEmitSDCheck(BB)) { - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); bool FunctionBasedInstrumentation = - TLI.getSSPStackGuardCheck(*MF->getFunction().getParent()); + TLI->getSSPStackGuardCheck(*MF->getFunction().getParent()); SPDescriptor.initialize(&BB, &MBB, FunctionBasedInstrumentation); } // Handle stack protector. @@ -3501,10 +3486,9 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineBasicBlock *ParentBB) { CurBuilder->setInsertPt(*ParentBB, ParentBB->end()); // First create the loads to the guard/stack slot for the comparison. 
- const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); Type *PtrIRTy = PointerType::getUnqual(MF->getFunction().getContext()); const LLT PtrTy = getLLTForType(*PtrIRTy, *DL); - LLT PtrMemTy = getLLTForMVT(TLI.getPointerMemTy(*DL)); + LLT PtrMemTy = getLLTForMVT(TLI->getPointerMemTy(*DL)); MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo(); int FI = MFI.getStackProtectorIndex(); @@ -3522,13 +3506,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile) .getReg(0); - if (TLI.useStackGuardXorFP()) { + if (TLI->useStackGuardXorFP()) { LLVM_DEBUG(dbgs() << "Stack protector xor'ing with FP not yet implemented"); return false; } // Retrieve guard check function, nullptr if instrumentation is inlined. - if (const Function *GuardCheckFn = TLI.getSSPStackGuardCheck(M)) { + if (const Function *GuardCheckFn = TLI->getSSPStackGuardCheck(M)) { // This path is currently untestable on GlobalISel, since the only platform // that needs this seems to be Windows, and we fall back on that currently. // The code still lives here in case that changes. @@ -3563,13 +3547,13 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. - if (TLI.useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode()) { Guard = MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits())); getStackGuard(Guard, *CurBuilder); } else { // TODO: test using android subtarget when we support @llvm.thread.pointer. - const Value *IRGuard = TLI.getSDagStackGuard(M); + const Value *IRGuard = TLI->getSDagStackGuard(M); Register GuardPtr = getOrCreateVReg(*IRGuard); Guard = CurBuilder @@ -3593,13 +3577,12 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, bool IRTranslator::emitSPDescriptorFailure(StackProtectorDescriptor &SPD, MachineBasicBlock *FailureBB) { CurBuilder->setInsertPt(*FailureBB, FailureBB->end()); - const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); const RTLIB::Libcall Libcall = RTLIB::STACKPROTECTOR_CHECK_FAIL; - const char *Name = TLI.getLibcallName(Libcall); + const char *Name = TLI->getLibcallName(Libcall); CallLowering::CallLoweringInfo Info; - Info.CallConv = TLI.getLibcallCallingConv(Libcall); + Info.CallConv = TLI->getLibcallCallingConv(Libcall); Info.Callee = MachineOperand::CreateES(Name); Info.OrigRet = {Register(), Type::getVoidTy(MF->getFunction().getContext()), 0}; @@ -3662,6 +3645,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { bool EnableCSE = EnableCSEInIRTranslator.getNumOccurrences() ? 
EnableCSEInIRTranslator : TPC->isGISelCSEEnabled(); + TLI = MF->getSubtarget().getTargetLowering(); if (EnableCSE) { EntryBuilder = std::make_unique(CurMF); @@ -3696,12 +3680,8 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { LibInfo = &getAnalysis().getTLI(F); FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF); - const auto &TLI = *MF->getSubtarget().getTargetLowering(); - SL = std::make_unique(this, FuncInfo); - SL->init(TLI, TM, *DL); - - + SL->init(*TLI, TM, *DL); assert(PendingPHIs.empty() && "stale PHIs"); From a4fff36b6cf64d0afa965a8ef4927145c5558124 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Wed, 28 Feb 2024 09:32:03 +0100 Subject: [PATCH 041/114] [bazel] Add "include/" for libc includes for 04e8653f189bf3d65680c7fb3b3033ad82903ee9 #83199 --- utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index 17eb30c8e458c22..7d815bc4a2299c6 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -17,6 +17,7 @@ def libc_common_copts(): libc_include_path = paths.join(root_label.workspace_root, root_label.package) return [ "-I" + libc_include_path, + "-I" + paths.join(libc_include_path, "include"), "-DLIBC_NAMESPACE=" + LIBC_NAMESPACE, ] From 619ee20b3911f9a481a75a64704c80aef16af9d0 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 28 Feb 2024 09:48:15 +0100 Subject: [PATCH 042/114] [mlir] add an example of using transform dialect standalone (#82623) Transform dialect interpreter is designed to be usable outside of the pass pipeline, as the main program transformation driver, e.g., for languages with explicit schedules. Provide an example of such usage with a couple of tests. 
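The heart of the standalone flow, condensed from the mlir-transform-opt.cpp added below (the calls are the ones the tool itself makes; parsing, CLI handling, diagnostics, and library merging are elided, and `payloadRoot`/`transformRoot` are assumed to be already-parsed module handles):

```c++
// Condensed sketch, not a complete program: locate the entry point named
// __transform_main in the transform module and run the interpreter over the
// payload module.
mlir::transform::TransformOpInterface entryPoint =
    mlir::transform::detail::findTransformEntryPoint(
        *transformRoot, mlir::ModuleOp(),
        mlir::transform::TransformDialect::kTransformEntryPointSymbolName);
if (!entryPoint)
  return mlir::failure();

mlir::transform::TransformOptions options;
options.enableExpensiveChecks(true);
if (mlir::failed(mlir::transform::applyTransforms(
        *payloadRoot, entryPoint, /*extraMapping=*/{}, options,
        /*enforceToplevelTransformOp=*/false)))
  return mlir::failure();

// On success, the payload module has been rewritten in place.
payloadRoot->print(llvm::outs());
```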
--- mlir/examples/CMakeLists.txt | 1 + mlir/examples/transform-opt/CMakeLists.txt | 26 ++ mlir/examples/transform-opt/README.md | 40 ++ .../transform-opt/mlir-transform-opt.cpp | 389 ++++++++++++++++++ mlir/test/CMakeLists.txt | 1 + mlir/test/Examples/transform-opt/empty.mlir | 12 + .../Examples/transform-opt/external-decl.mlir | 18 + .../Examples/transform-opt/external-def.mlir | 8 + mlir/test/Examples/transform-opt/pass.mlir | 19 + .../transform-opt/self-contained.mlir | 21 + .../Examples/transform-opt/syntax-error.mlir | 5 + mlir/test/lit.cfg.py | 1 + 12 files changed, 541 insertions(+) create mode 100644 mlir/examples/transform-opt/CMakeLists.txt create mode 100644 mlir/examples/transform-opt/README.md create mode 100644 mlir/examples/transform-opt/mlir-transform-opt.cpp create mode 100644 mlir/test/Examples/transform-opt/empty.mlir create mode 100644 mlir/test/Examples/transform-opt/external-decl.mlir create mode 100644 mlir/test/Examples/transform-opt/external-def.mlir create mode 100644 mlir/test/Examples/transform-opt/pass.mlir create mode 100644 mlir/test/Examples/transform-opt/self-contained.mlir create mode 100644 mlir/test/Examples/transform-opt/syntax-error.mlir diff --git a/mlir/examples/CMakeLists.txt b/mlir/examples/CMakeLists.txt index d256bf1a5cbb13d..2a1cac34d8c290c 100644 --- a/mlir/examples/CMakeLists.txt +++ b/mlir/examples/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(toy) add_subdirectory(transform) +add_subdirectory(transform-opt) add_subdirectory(minimal-opt) diff --git a/mlir/examples/transform-opt/CMakeLists.txt b/mlir/examples/transform-opt/CMakeLists.txt new file mode 100644 index 000000000000000..8e23555d0b5d734 --- /dev/null +++ b/mlir/examples/transform-opt/CMakeLists.txt @@ -0,0 +1,26 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS) + +set(LIBS + MLIRAnalysis + MLIRIR + MLIRParser + MLIRSupport + MLIRTransformDialect + MLIRTransformDialectTransforms + MLIRTransforms + ${dialect_libs} + ${conversion_libs} + ${extension_libs} +) + +add_mlir_tool(mlir-transform-opt + mlir-transform-opt.cpp + + DEPENDS + ${LIBS} +) +target_link_libraries(mlir-transform-opt PRIVATE ${LIBS}) +llvm_update_compile_flags(mlir-transform-opt) +mlir_check_all_link_libraries(mlir-transform-opt) diff --git a/mlir/examples/transform-opt/README.md b/mlir/examples/transform-opt/README.md new file mode 100644 index 000000000000000..e9c8cc0173c7b2c --- /dev/null +++ b/mlir/examples/transform-opt/README.md @@ -0,0 +1,40 @@ +# Standalone Transform Dialect Interpreter + +This is an example of using the Transform dialect interpreter functionality standalone, that is, outside of the regular pass pipeline. The example is a +binary capable of processing MLIR source files similar to `mlir-opt` and other +optimizer drivers, with the entire transformation process driven by a Transform +dialect script. This script can be embedded into the source file or provided in +a separate MLIR source file. + +Either the input module or the transform module must contain a top-level symbol +named `__transform_main`, which is used as the entry point to the transformation +script. + +```sh +mlir-transform-opt payload_with_embedded_transform.mlir +mlir-transform-opt payload.mlir -transform=transform.mlir +``` + +The name of the entry point can be overridden using command-line options. 
+ +```sh +mlir-transform-opt payload.mlir -transform-entry-point=another_entry_point +``` + +Transform scripts can reference symbols defined in other source files, called +libraries, which can be supplied to the binary through command-line options. +Libraries will be embedded into the main transformation module by the tool and +the interpreter will process everything as a single module. A debug option is +available to see the contents of the transform module before it goes into the interpreter. + +```sh +mlir-transform-opt payload.mlir -transform=transform.mlir \ + -transform-library=external_definitions_1.mlir \ + -transform-library=external_definitions_2.mlir \ + -dump-library-module +``` + +Check out the [Transform dialect +tutorial](https://mlir.llvm.org/docs/Tutorials/transform/) as well as +[documentation](https://mlir.llvm.org/docs/Dialects/Transform/) to learn more +about the dialect. diff --git a/mlir/examples/transform-opt/mlir-transform-opt.cpp b/mlir/examples/transform-opt/mlir-transform-opt.cpp new file mode 100644 index 000000000000000..41a17f18726b1aa --- /dev/null +++ b/mlir/examples/transform-opt/mlir-transform-opt.cpp @@ -0,0 +1,389 @@ +//===- mlir-transform-opt.cpp -----------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/Utils.h" +#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/InitAllDialects.h" +#include "mlir/InitAllExtensions.h" +#include "mlir/InitAllPasses.h" +#include "mlir/Parser/Parser.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include + +namespace { + +using namespace llvm; + +/// Structure containing command line options for the tool; these will get +/// initialized when an instance is created. 
+struct MlirTransformOptCLOptions { + cl::opt allowUnregisteredDialects{ + "allow-unregistered-dialect", + cl::desc("Allow operations coming from an unregistered dialect"), + cl::init(false)}; + + cl::opt verifyDiagnostics{ + "verify-diagnostics", + cl::desc("Check that emitted diagnostics match expected-* lines " + "on the corresponding line"), + cl::init(false)}; + + cl::opt payloadFilename{cl::Positional, cl::desc(""), + cl::init("-")}; + + cl::opt outputFilename{"o", cl::desc("Output filename"), + cl::value_desc("filename"), + cl::init("-")}; + + cl::opt transformMainFilename{ + "transform", + cl::desc("File containing entry point of the transform script, if " + "different from the input file"), + cl::value_desc("filename"), cl::init("")}; + + cl::list transformLibraryFilenames{ + "transform-library", cl::desc("File(s) containing definitions of " + "additional transform script symbols")}; + + cl::opt transformEntryPoint{ + "transform-entry-point", + cl::desc("Name of the entry point transform symbol"), + cl::init(mlir::transform::TransformDialect::kTransformEntryPointSymbolName + .str())}; + + cl::opt disableExpensiveChecks{ + "disable-expensive-checks", + cl::desc("Disables potentially expensive checks in the transform " + "interpreter, providing more speed at the expense of " + "potential memory problems and silent corruptions"), + cl::init(false)}; + + cl::opt dumpLibraryModule{ + "dump-library-module", + cl::desc("Prints the combined library module before the output"), + cl::init(false)}; +}; +} // namespace + +/// "Managed" static instance of the command-line options structure. This makes +/// them locally-scoped and explicitly initialized/deinitialized. While this is +/// not strictly necessary in the tool source file that is not being used as a +/// library (where the options would pollute the global list of options), it is +/// good practice to follow this. +static llvm::ManagedStatic clOptions; + +/// Explicitly registers command-line options. +static void registerCLOptions() { *clOptions; } + +namespace { +/// A wrapper class for source managers diagnostic. This provides both unique +/// ownership and virtual function-like overload for a pair of +/// inheritance-related classes that do not use virtual functions. +class DiagnosticHandlerWrapper { +public: + /// Kind of the diagnostic handler to use. + enum class Kind { EmitDiagnostics, VerifyDiagnostics }; + + /// Constructs the diagnostic handler of the specified kind of the given + /// source manager and context. + DiagnosticHandlerWrapper(Kind kind, llvm::SourceMgr &mgr, + mlir::MLIRContext *context) { + if (kind == Kind::EmitDiagnostics) + handler = new mlir::SourceMgrDiagnosticHandler(mgr, context); + else + handler = new mlir::SourceMgrDiagnosticVerifierHandler(mgr, context); + } + + /// This object is non-copyable but movable. + DiagnosticHandlerWrapper(const DiagnosticHandlerWrapper &) = delete; + DiagnosticHandlerWrapper(DiagnosticHandlerWrapper &&other) = default; + DiagnosticHandlerWrapper & + operator=(const DiagnosticHandlerWrapper &) = delete; + DiagnosticHandlerWrapper &operator=(DiagnosticHandlerWrapper &&) = default; + + /// Verifies the captured "expected-*" diagnostics if required. + mlir::LogicalResult verify() const { + if (auto *ptr = + handler.dyn_cast()) { + return ptr->verify(); + } + return mlir::success(); + } + + /// Destructs the object of the same type as allocated. 
+ ~DiagnosticHandlerWrapper() { + if (auto *ptr = handler.dyn_cast()) { + delete ptr; + } else { + delete handler.get(); + } + } + +private: + /// Internal storage is a type-safe union. + llvm::PointerUnion + handler; +}; + +/// MLIR has deeply rooted expectations that the LLVM source manager contains +/// exactly one buffer, until at least the lexer level. This class wraps +/// multiple LLVM source managers each managing a buffer to match MLIR's +/// expectations while still providing a centralized handling mechanism. +class TransformSourceMgr { +public: + /// Constructs the source manager indicating whether diagnostic messages will + /// be verified later on. + explicit TransformSourceMgr(bool verifyDiagnostics) + : verifyDiagnostics(verifyDiagnostics) {} + + /// Deconstructs the source manager. Note that `checkResult` must have been + /// called on this instance before deconstructing it. + ~TransformSourceMgr() { + assert(resultChecked && "must check the result of diagnostic handlers by " + "running TransformSourceMgr::checkResult"); + } + + /// Parses the given buffer and creates the top-level operation of the kind + /// specified as template argument in the given context. Additional parsing + /// options may be provided. + template + mlir::OwningOpRef parseBuffer(std::unique_ptr buffer, + mlir::MLIRContext &context, + const mlir::ParserConfig &config) { + // Create a single-buffer LLVM source manager. Note that `unique_ptr` allows + // the code below to capture a reference to the source manager in such a way + // that it is not invalidated when the vector contents are eventually + // reallocated. + llvm::SourceMgr &mgr = + *sourceMgrs.emplace_back(std::make_unique()); + mgr.AddNewSourceBuffer(std::move(buffer), llvm::SMLoc()); + + // Choose the type of diagnostic handler depending on whether diagnostic + // verification needs to happen and store it. + if (verifyDiagnostics) { + diagHandlers.emplace_back( + DiagnosticHandlerWrapper::Kind::VerifyDiagnostics, mgr, &context); + } else { + diagHandlers.emplace_back(DiagnosticHandlerWrapper::Kind::EmitDiagnostics, + mgr, &context); + } + + // Defer to MLIR's parser. + return mlir::parseSourceFile(mgr, config); + } + + /// If diagnostic message verification has been requested upon construction of + /// this source manager, performs the verification, reports errors and returns + /// the result of the verification. Otherwise passes through the given value. + mlir::LogicalResult checkResult(mlir::LogicalResult result) { + resultChecked = true; + if (!verifyDiagnostics) + return result; + + return mlir::failure(llvm::any_of(diagHandlers, [](const auto &handler) { + return mlir::failed(handler.verify()); + })); + } + +private: + /// Indicates whether diagnostic message verification is requested. + const bool verifyDiagnostics; + + /// Indicates that diagnostic message verification has taken place, and the + /// deconstruction is therefore safe. + bool resultChecked = false; + + /// Storage for per-buffer source managers and diagnostic handlers. These are + /// wrapped into unique pointers in order to make it safe to capture + /// references to these objects: if the vector is reallocated, the unique + /// pointer objects are moved but the pointer addresses won't change. Also, for + /// handlers, this allows storing the pointer to the base class. 
+ SmallVector> sourceMgrs; + SmallVector diagHandlers; +}; +} // namespace + +/// Trivial wrapper around `applyTransforms` that doesn't support extra mapping +/// and doesn't enforce the entry point transform ops being top-level. +static mlir::LogicalResult +applyTransforms(mlir::Operation *payloadRoot, + mlir::transform::TransformOpInterface transformRoot, + const mlir::transform::TransformOptions &options) { + return applyTransforms(payloadRoot, transformRoot, {}, options, + /*enforceToplevelTransformOp=*/false); +} + +/// Applies transforms indicated in the transform dialect script to the input +/// buffer. The transform script may be embedded in the input buffer or as a +/// separate buffer. The transform script may have external symbols, the +/// definitions of which must be provided in transform library buffers. If the +/// application is successful, prints the transformed input buffer into the +/// given output stream. Additional configuration options are derived from +/// command-line options. +static mlir::LogicalResult processPayloadBuffer( + raw_ostream &os, std::unique_ptr inputBuffer, + std::unique_ptr transformBuffer, + MutableArrayRef> transformLibraries, + mlir::DialectRegistry ®istry) { + + // Initialize the MLIR context, and various configurations. + mlir::MLIRContext context(registry, mlir::MLIRContext::Threading::DISABLED); + context.allowUnregisteredDialects(clOptions->allowUnregisteredDialects); + mlir::ParserConfig config(&context); + TransformSourceMgr sourceMgr( + /*verifyDiagnostics=*/clOptions->verifyDiagnostics); + + // Parse the input buffer that will be used as transform payload. + mlir::OwningOpRef payloadRoot = + sourceMgr.parseBuffer(std::move(inputBuffer), context, config); + if (!payloadRoot) + return sourceMgr.checkResult(mlir::failure()); + + // Identify the module containing the transform script entry point. This may + // be the same module as the input or a separate module. In the former case, + // make a copy of the module so it can be modified freely. Modification may + // happen in the script itself (at which point it could be rewriting itself + // during interpretation, leading to tricky memory errors) or by embedding + // library modules in the script. + mlir::OwningOpRef transformRoot; + if (transformBuffer) { + transformRoot = sourceMgr.parseBuffer( + std::move(transformBuffer), context, config); + if (!transformRoot) + return sourceMgr.checkResult(mlir::failure()); + } else { + transformRoot = cast(payloadRoot->clone()); + } + + // Parse and merge the libraries into the main transform module. + for (auto &&transformLibrary : transformLibraries) { + mlir::OwningOpRef libraryModule = + sourceMgr.parseBuffer(std::move(transformLibrary), + context, config); + + if (!libraryModule || + mlir::failed(mlir::transform::detail::mergeSymbolsInto( + *transformRoot, std::move(libraryModule)))) + return sourceMgr.checkResult(mlir::failure()); + } + + // If requested, dump the combined transform module. + if (clOptions->dumpLibraryModule) + transformRoot->dump(); + + // Find the entry point symbol. Even if it had originally been in the payload + // module, it was cloned into the transform module so only look there. + mlir::transform::TransformOpInterface entryPoint = + mlir::transform::detail::findTransformEntryPoint( + *transformRoot, mlir::ModuleOp(), clOptions->transformEntryPoint); + if (!entryPoint) + return sourceMgr.checkResult(mlir::failure()); + + // Apply the requested transformations. 
+ mlir::transform::TransformOptions transformOptions; + transformOptions.enableExpensiveChecks(!clOptions->disableExpensiveChecks); + if (mlir::failed(applyTransforms(*payloadRoot, entryPoint, transformOptions))) + return sourceMgr.checkResult(mlir::failure()); + + // Print the transformed result and check the captured diagnostics if + // requested. + payloadRoot->print(os); + return sourceMgr.checkResult(mlir::success()); +} + +/// Tool entry point. +static mlir::LogicalResult runMain(int argc, char **argv) { + // Register all upstream dialects and extensions. Specific uses are advised + // not to register all dialects indiscriminately but rather hand-pick what is + // necessary for their use case. + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::registerAllExtensions(registry); + mlir::registerAllPasses(); + + // Explicitly register the transform dialect. This is not strictly necessary + // since it has been already registered as part of the upstream dialect list, + // but useful for example purposes for cases when dialects to register are + // hand-picked. The transform dialect must be registered. + registry.insert(); + + // Register various command-line options. Note that the LLVM initializer + // object is a RAII that ensures correct deconstruction of command-line option + // objects inside ManagedStatic. + llvm::InitLLVM y(argc, argv); + mlir::registerAsmPrinterCLOptions(); + mlir::registerMLIRContextCLOptions(); + registerCLOptions(); + llvm::cl::ParseCommandLineOptions(argc, argv, + "Minimal Transform dialect driver\n"); + + // Try opening the main input file. + std::string errorMessage; + std::unique_ptr payloadFile = + mlir::openInputFile(clOptions->payloadFilename, &errorMessage); + if (!payloadFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + + // Try opening the output file. + std::unique_ptr outputFile = + mlir::openOutputFile(clOptions->outputFilename, &errorMessage); + if (!outputFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + + // Try opening the main transform file if provided. + std::unique_ptr transformRootFile; + if (!clOptions->transformMainFilename.empty()) { + if (clOptions->transformMainFilename == clOptions->payloadFilename) { + llvm::errs() << "warning: " << clOptions->payloadFilename + << " is provided as both payload and transform file\n"; + } else { + transformRootFile = + mlir::openInputFile(clOptions->transformMainFilename, &errorMessage); + if (!transformRootFile) { + llvm::errs() << errorMessage << "\n"; + return mlir::failure(); + } + } + } + + // Try opening transform library files if provided. 
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> transformLibraries;
+  transformLibraries.reserve(clOptions->transformLibraryFilenames.size());
+  for (llvm::StringRef filename : clOptions->transformLibraryFilenames) {
+    transformLibraries.emplace_back(
+        mlir::openInputFile(filename, &errorMessage));
+    if (!transformLibraries.back()) {
+      llvm::errs() << errorMessage << "\n";
+      return mlir::failure();
+    }
+  }
+
+  return processPayloadBuffer(outputFile->os(), std::move(payloadFile),
+                              std::move(transformRootFile), transformLibraries,
+                              registry);
+}
+
+int main(int argc, char **argv) {
+  return mlir::asMainReturnCode(runMain(argc, argv));
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 74921544c555781..baf07ea1f010ac7 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -173,6 +173,7 @@ if(LLVM_BUILD_EXAMPLES)
     transform-opt-ch3
     transform-opt-ch4
     mlir-minimal-opt
+    mlir-transform-opt
     )
   if(MLIR_ENABLE_EXECUTION_ENGINE)
     list(APPEND MLIR_TEST_DEPENDS
diff --git a/mlir/test/Examples/transform-opt/empty.mlir b/mlir/test/Examples/transform-opt/empty.mlir
new file mode 100644
index 000000000000000..b525769db688221
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/empty.mlir
@@ -0,0 +1,12 @@
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir | FileCheck %s
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL
+// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+// RUN: mlir-transform-opt %s --transform=%p/self-contained.mlir --transform-library=%p/external-def.mlir --transform-library=%p/syntax-error.mlir --verify-diagnostics
+
+// CHECK: IR printer: in self-contained
+// EXTERNAL: IR printer: external_def
+// CHECK-NOT: @__transform_main
+module {}
diff --git a/mlir/test/Examples/transform-opt/external-decl.mlir b/mlir/test/Examples/transform-opt/external-decl.mlir
new file mode 100644
index 000000000000000..5a73735892429b4
--- /dev/null
+++ b/mlir/test/Examples/transform-opt/external-decl.mlir
@@ -0,0 +1,18 @@
+// This test just needs to parse. Note that the diagnostic message below will
+// be produced in *another* multi-file test, do *not* -verify-diagnostics here.
+// RUN: mlir-opt %s
+
+// RUN: mlir-transform-opt %s --transform-library=%p/external-def.mlir | FileCheck %s
+
+module attributes {transform.with_named_sequence} {
+  // The definition should not be printed here.
+ // CHECK: @external_def + // CHECK-NOT: transform.print + transform.named_sequence private @external_def(%root: !transform.any_op {transform.readonly}) + + transform.named_sequence private @__transform_main(%root: !transform.any_op) { + // expected-error @below {{unresolved external named sequence}} + transform.include @external_def failures(propagate) (%root) : (!transform.any_op) -> () + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/external-def.mlir b/mlir/test/Examples/transform-opt/external-def.mlir new file mode 100644 index 000000000000000..9dc4fbbdd6b6195 --- /dev/null +++ b/mlir/test/Examples/transform-opt/external-def.mlir @@ -0,0 +1,8 @@ +// RUN: mlir-opt %s + +module attributes {transform.with_named_sequence} { + transform.named_sequence @external_def(%root: !transform.any_op {transform.readonly}) { + transform.print %root { name = "external_def" } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/pass.mlir b/mlir/test/Examples/transform-opt/pass.mlir new file mode 100644 index 000000000000000..5c7c8bf1e256dfd --- /dev/null +++ b/mlir/test/Examples/transform-opt/pass.mlir @@ -0,0 +1,19 @@ +// RUN: mlir-transform-opt %s | FileCheck %s + +module attributes {transform.with_named_sequence} { + // CHECK-LABEL: @return_42 + // CHECK: %[[C42:.+]] = arith.constant 42 + // CHECK: return %[[C42]] + func.func @return_42() -> i32 { + %0 = arith.constant 21 : i32 + %1 = arith.constant 2 : i32 + %2 = arith.muli %0, %1 : i32 + return %2 : i32 + } + + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %arg1 = transform.apply_registered_pass "canonicalize" to %arg0 : (!transform.any_op) -> !transform.any_op + transform.print %arg1 : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/self-contained.mlir b/mlir/test/Examples/transform-opt/self-contained.mlir new file mode 100644 index 000000000000000..b9a93af61b8bbcb --- /dev/null +++ b/mlir/test/Examples/transform-opt/self-contained.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-transform-opt %s | FileCheck %s +// RUN: mlir-transform-opt %s --transform=%s | FileCheck %s +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --verify-diagnostics +// RUN: mlir-transform-opt %s --transform=%p/external-def.mlir --transform-entry-point=external_def | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/external-decl.mlir --transform-library=%p/external-def.mlir | FileCheck %s --check-prefix=EXTERNAL +// RUN: mlir-transform-opt %s --transform=%p/syntax-error.mlir --verify-diagnostics + +// CHECK: IR printer: in self-contained +// EXTERNAL: IR printer: external_def + +// The first occurrence comes from the print operation and the second is the +// roundtrip output. However, we shouldn't have the symbol duplicated because +// of library merging. +// CHECK-COUNT-2: @__transform_main +// CHECK-NOT: @__transform_main +module attributes {transform.with_named_sequence} { + transform.named_sequence private @__transform_main(%root: !transform.any_op) { + transform.print %root { name = "in self-contained" } : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Examples/transform-opt/syntax-error.mlir b/mlir/test/Examples/transform-opt/syntax-error.mlir new file mode 100644 index 000000000000000..89f1d472fe8952e --- /dev/null +++ b/mlir/test/Examples/transform-opt/syntax-error.mlir @@ -0,0 +1,5 @@ +// RUN: mlir-opt %s --verify-diagnostics +// This file is used as additional input. 
+ +// expected-error @below {{expected operation name in quotes}} +module { diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 904dfb680a0404e..7636ef30c2d3ef4 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -161,6 +161,7 @@ def add_runtime(name): ToolSubst("transform-opt-ch2", unresolved="ignore"), ToolSubst("transform-opt-ch3", unresolved="ignore"), ToolSubst("transform-opt-ch4", unresolved="ignore"), + ToolSubst("mlir-transform-opt", unresolved="ignore"), ToolSubst("%mlir_lib_dir", config.mlir_lib_dir, unresolved="ignore"), ToolSubst("%mlir_src_dir", config.mlir_src_root, unresolved="ignore"), ] From 49c399c2d113df1654b09c9b5afa38924829a8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 07:38:22 +0100 Subject: [PATCH 043/114] [clang][Interp] Toplevel destructors may fail We used to run them, but not check if they failed. If they do, the expression is invalid, even if we already have a result. I do have a suspicion that we need to manually call destroyLocals() in more places (everywhere basically?), but I'll wait with that until I have a reproducer at hand. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 22 ++++++++++-------- clang/lib/AST/Interp/ByteCodeExprGen.h | 29 ++++++++++++++++-------- clang/lib/AST/Interp/EvalEmitter.cpp | 5 +++- clang/lib/AST/Interp/EvaluationResult.h | 3 ++- clang/test/AST/Interp/cxx20.cpp | 22 ++++++++++++++++-- 5 files changed, 59 insertions(+), 22 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index f193f959d3a6c08..b151f8d0d7a79cd 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2587,7 +2587,10 @@ bool ByteCodeExprGen::visitExpr(const Expr *E) { if (!this->emitFinishInit(E)) return false; - return this->emitRetValue(E); + // We are destroying the locals AFTER the Ret op. + // The Ret op needs to copy the (alive) values, but the + // destructors may still turn the entire expression invalid. + return this->emitRetValue(E) && RootScope.destroyLocals(); } return false; @@ -3414,14 +3417,15 @@ bool ByteCodeExprGen::emitRecordDestruction(const Record *R) { // Now emit the destructor and recurse into base classes. if (const CXXDestructorDecl *Dtor = R->getDestructor(); Dtor && !Dtor->isTrivial()) { - if (const Function *DtorFunc = getFunction(Dtor)) { - assert(DtorFunc->hasThisPointer()); - assert(DtorFunc->getNumParams() == 1); - if (!this->emitDupPtr(SourceInfo{})) - return false; - if (!this->emitCall(DtorFunc, 0, SourceInfo{})) - return false; - } + const Function *DtorFunc = getFunction(Dtor); + if (!DtorFunc) + return false; + assert(DtorFunc->hasThisPointer()); + assert(DtorFunc->getNumParams() == 1); + if (!this->emitDupPtr(SourceInfo{})) + return false; + if (!this->emitCall(DtorFunc, 0, SourceInfo{})) + return false; } for (const Record::Base &Base : llvm::reverse(R->bases())) { diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 5b3b533dba38777..acbbcc3dc9619ad 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -332,7 +332,7 @@ template class VariableScope { } virtual void emitDestruction() {} - virtual void emitDestructors() {} + virtual bool emitDestructors() { return true; } VariableScope *getParent() const { return Parent; } protected: @@ -356,13 +356,18 @@ template class LocalScope : public VariableScope { } /// Overriden to support explicit destruction. 
- void emitDestruction() override { + void emitDestruction() override { destroyLocals(); } + + /// Explicit destruction of local variables. + bool destroyLocals() { if (!Idx) - return; - this->emitDestructors(); + return true; + + bool Success = this->emitDestructors(); this->Ctx->emitDestroy(*Idx, SourceInfo{}); removeStoredOpaqueValues(); this->Idx = std::nullopt; + return Success; } void addLocal(const Scope::Local &Local) override { @@ -374,19 +379,25 @@ template class LocalScope : public VariableScope { this->Ctx->Descriptors[*Idx].emplace_back(Local); } - void emitDestructors() override { + bool emitDestructors() override { if (!Idx) - return; + return true; // Emit destructor calls for local variables of record // type with a destructor. for (Scope::Local &Local : this->Ctx->Descriptors[*Idx]) { if (!Local.Desc->isPrimitive() && !Local.Desc->isPrimitiveArray()) { - this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{}); - this->Ctx->emitDestruction(Local.Desc); - this->Ctx->emitPopPtr(SourceInfo{}); + if (!this->Ctx->emitGetPtrLocal(Local.Offset, SourceInfo{})) + return false; + + if (!this->Ctx->emitDestruction(Local.Desc)) + return false; + + if (!this->Ctx->emitPopPtr(SourceInfo{})) + return false; removeIfStoredOpaqueValue(Local); } } + return true; } void removeStoredOpaqueValues() { diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 9cae25f5c4d6421..c9c2bf9b145b22f 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -38,8 +38,11 @@ EvaluationResult EvalEmitter::interpretExpr(const Expr *E, this->ConvertResultToRValue = ConvertResultToRValue; EvalResult.setSource(E); - if (!this->visitExpr(E) && EvalResult.empty()) + if (!this->visitExpr(E)) { + // EvalResult may already have a result set, but something failed + // after that (e.g. evaluating destructors). EvalResult.setInvalid(); + } return std::move(this->EvalResult); } diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h index 28e1ae6ba3e7ab7..ecf2250074cc9e0 100644 --- a/clang/lib/AST/Interp/EvaluationResult.h +++ b/clang/lib/AST/Interp/EvaluationResult.h @@ -72,7 +72,8 @@ class EvaluationResult final { Kind = LValue; } void setInvalid() { - assert(empty()); + // We are NOT asserting empty() here, since setting it to invalid + // is allowed even if there is already a result. 
Kind = Invalid; } void setValid() { diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp index 5c9c62579651089..b24b0c8a3ba0eca 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify %s -// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=ref %s +// RUN: %clang_cc1 -fcxx-exceptions -fexperimental-new-constant-interpreter -std=c++20 -verify=both,expected -fcxx-exceptions %s +// RUN: %clang_cc1 -fcxx-exceptions -std=c++20 -verify=both,ref -fcxx-exceptions %s void test_alignas_operand() { alignas(8) char dummy; @@ -799,3 +799,21 @@ void f2() { // access info for unnamed bit-field } } + +namespace FailingDestructor { + struct D { + int n; + bool can_destroy; + + constexpr ~D() { + if (!can_destroy) + throw "oh no"; + } + }; + template + void f() {} // both-note {{invalid explicitly-specified argument}} + + void g() { + f(); // both-error {{no matching function}} + } +} From 9f99eda1208787364b1a381b2d4e146fc4868cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 07:50:44 +0100 Subject: [PATCH 044/114] [clang][Interp][NFC] Convert test to verify=expected,both style --- clang/test/AST/Interp/cxx20.cpp | 156 +++++++++++--------------------- 1 file changed, 52 insertions(+), 104 deletions(-) diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp index b24b0c8a3ba0eca..2c28e53784c5c63 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -58,13 +58,10 @@ static_assert(pointerAssign2() == 12, ""); constexpr int unInitLocal() { int a; - return a; // ref-note {{read of uninitialized object}} \ - // expected-note {{read of uninitialized object}} + return a; // both-note {{read of uninitialized object}} } -static_assert(unInitLocal() == 0, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'unInitLocal()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'unInitLocal()'}} \ +static_assert(unInitLocal() == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'unInitLocal()'}} constexpr int initializedLocal() { int a; @@ -75,25 +72,19 @@ static_assert(initializedLocal() == 20); constexpr int initializedLocal2() { int a[2]; - return *a; // expected-note {{read of uninitialized object is not allowed in a constant expression}} \ - // ref-note {{read of uninitialized object is not allowed in a constant expression}} + return *a; // both-note {{read of uninitialized object is not allowed in a constant expression}} } -static_assert(initializedLocal2() == 20); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(initializedLocal2() == 20); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} struct Int { int a; }; constexpr int initializedLocal3() { Int i; - return i.a; // ref-note {{read of uninitialized object is not allowed in a constant expression}} \ - // expected-note {{read of uninitialized object}} + return i.a; // both-note {{read of uninitialized object is not allowed in a constant expression}} } -static_assert(initializedLocal3() == 20); // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{not 
an integral constant expression}} \ - // ref-note {{in call to}} +static_assert(initializedLocal3() == 20); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} @@ -137,22 +128,16 @@ static_assert(!b4); // ref-error {{not an integral constant expression}} \ namespace UninitializedFields { class A { public: - int a; // expected-note 4{{subobject declared here}} \ - // ref-note 4{{subobject declared here}} + int a; // both-note 4{{subobject declared here}} constexpr A() {} }; - constexpr A a; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} - constexpr A aarr[2]; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr A a; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} + constexpr A aarr[2]; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class F { public: - int f; // expected-note 3{{subobject declared here}} \ - // ref-note 3{{subobject declared here}} + int f; // both-note 3{{subobject declared here}} constexpr F() {} constexpr F(bool b) { @@ -161,26 +146,19 @@ namespace UninitializedFields { } }; - constexpr F foo[2] = {true}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} - constexpr F foo2[3] = {true, false, true}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} - constexpr F foo3[3] = {true, true, F()}; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'f' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'f' is not initialized}} + constexpr F foo[2] = {true}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} + constexpr F foo2[3] = {true, false, true}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} + constexpr F foo3[3] = {true, true, F()}; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'f' is not initialized}} class Base { public: bool b; - int a; // expected-note {{subobject declared here}} \ - // ref-note {{subobject declared here}} + int a; // both-note {{subobject declared here}} constexpr Base() : b(true) {} }; @@ -188,56 +166,44 @@ namespace UninitializedFields { public: constexpr Derived() : Base() {} }; - constexpr Derived D; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr Derived D; // both-error {{must be initialized by a constant 
expression}} \ + // both-note {{subobject 'a' is not initialized}} class C2 { public: A a; constexpr C2() {} }; - constexpr C2 c2; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr C2 c2; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class C3 { public: A a[2]; constexpr C3() {} }; - constexpr C3 c3; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'a' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'a' is not initialized}} + constexpr C3 c3; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'a' is not initialized}} class C4 { public: - bool B[2][3]; // expected-note {{subobject declared here}} \ - // ref-note {{subobject declared here}} + bool B[2][3]; // both-note {{subobject declared here}} constexpr C4(){} }; - constexpr C4 c4; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{subobject 'B' is not initialized}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{subobject 'B' is not initialized}} + constexpr C4 c4; // both-error {{must be initialized by a constant expression}} \ + // both-note {{subobject 'B' is not initialized}} }; namespace ConstThis { class Foo { - const int T = 12; // expected-note {{declared const here}} \ - // ref-note {{declared const here}} + const int T = 12; // both-note {{declared const here}} int a; public: constexpr Foo() { this->a = 10; - T = 13; // expected-error {{cannot assign to non-static data member 'T' with const-qualified type}} \ - // ref-error {{cannot assign to non-static data member 'T' with const-qualified type}} + T = 13; // both-error {{cannot assign to non-static data member 'T' with const-qualified type}} } }; - constexpr Foo F; // expected-error {{must be initialized by a constant expression}} \ - // ref-error {{must be initialized by a constant expression}} + constexpr Foo F; // both-error {{must be initialized by a constant expression}} class FooDtor { @@ -264,8 +230,7 @@ namespace ConstThis { constexpr ctor_test() { if (Good) a = 10; - int local = 100 / a; // expected-note {{division by zero}} \ - // ref-note {{division by zero}} + int local = 100 / a; // both-note {{division by zero}} } }; @@ -277,22 +242,17 @@ namespace ConstThis { constexpr ~dtor_test() { if (Good) a = 10; - int local = 100 / a; // expected-note {{division by zero}} \ - // ref-note {{division by zero}} + int local = 100 / a; // both-note {{division by zero}} } }; constexpr ctor_test good_ctor; constexpr dtor_test good_dtor; - constexpr ctor_test bad_ctor; // expected-error {{must be initialized by a constant expression}} \ - // expected-note {{in call to}} \ - // ref-error {{must be initialized by a constant expression}} \ - // ref-note {{in call to}} - constexpr dtor_test bad_dtor; // expected-error {{must have constant destruction}} \ - // expected-note {{in call to}} \ - // ref-error {{must have constant destruction}} \ - // ref-note {{in call to}} + constexpr ctor_test bad_ctor; // both-error {{must be initialized by a constant expression}} \ + // both-note {{in call to}} + constexpr dtor_test bad_dtor; // both-error {{must have constant destruction}} \ + // 
both-note {{in call to}} }; namespace BaseInit { @@ -311,10 +271,8 @@ namespace BaseInit { }; static_assert(Final{1, 2, 3}.c == 3, ""); // OK - static_assert(Final{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} struct Mixin { @@ -333,10 +291,8 @@ namespace BaseInit { static_assert(Final2{1, 2, 3}.c == 3, ""); // OK static_assert(Final2{1, 2, 3}.b == 2, ""); // OK - static_assert(Final2{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final2{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} struct Mixin3 { @@ -352,10 +308,8 @@ namespace BaseInit { static_assert(Final3{1, 2, 3}.c == 3, ""); // OK static_assert(Final3{1, 2, 3}.b == 2, ""); // OK - static_assert(Final3{1, 2, 3}.a == 0, ""); // expected-error {{not an integral constant expression}} \ - // expected-note {{read of uninitialized object}} \ - // ref-error {{not an integral constant expression}} \ - // ref-note {{read of uninitialized object}} + static_assert(Final3{1, 2, 3}.a == 0, ""); // both-error {{not an integral constant expression}} \ + // both-note {{read of uninitialized object}} }; namespace Destructors { @@ -633,16 +587,13 @@ namespace ImplicitFunction { /// The operator= call here will fail and the diagnostics should be fine. b = a; // ref-note {{subobject 'a' is not initialized}} \ - // ref-note {{in call to}} \ // expected-note {{read of uninitialized object}} \ - // expected-note {{in call to}} + // both-note {{in call to}} return 1; } - static_assert(callMe() == 1, ""); // ref-error {{not an integral constant expression}} \ - // ref-note {{in call to 'callMe()'}} \ - // expected-error {{not an integral constant expression}} \ - // expected-note {{in call to 'callMe()'}} + static_assert(callMe() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to 'callMe()'}} } /// FIXME: Unfortunately, the similar tests in test/SemaCXX/{compare-cxx2a.cpp use member pointers, @@ -680,8 +631,7 @@ namespace ThreeWayCmp { static_assert(1.0 <=> 2.f == -1, ""); static_assert(1.0 <=> 1.0 == 0, ""); static_assert(2.0 <=> 1.0 == 1, ""); - constexpr int k = (1 <=> 1, 0); // expected-warning {{comparison result unused}} \ - // ref-warning {{comparison result unused}} + constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}} static_assert(k== 0, ""); /// Pointers. 
@@ -690,10 +640,8 @@ namespace ThreeWayCmp {
   constexpr const int *pa1 = &a[1];
   constexpr const int *pa2 = &a[2];
   constexpr const int *pb1 = &b[1];
-  static_assert(pa1 <=> pb1 != 0, ""); // expected-error {{not an integral constant expression}} \
-                                       // expected-note {{has unspecified value}} \
-                                       // ref-error {{not an integral constant expression}} \
-                                       // ref-note {{has unspecified value}}
+  static_assert(pa1 <=> pb1 != 0, ""); // both-error {{not an integral constant expression}} \
+                                       // both-note {{has unspecified value}} \
   static_assert(pa1 <=> pa1 == 0, "");
   static_assert(pa1 <=> pa2 == -1, "");
   static_assert(pa2 <=> pa1 == 1, "");

From 1d61709f7718d2dc2aee1c27d02043a748e8e6d7 Mon Sep 17 00:00:00 2001
From: Jean Perier
Date: Wed, 28 Feb 2024 01:05:01 -0800
Subject: [PATCH 045/114] [flang] fix warning after #82042

---
 flang/lib/Lower/HostAssociations.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp
index 44cc0e74e3b52ab..b9e13ccad1c9290 100644
--- a/flang/lib/Lower/HostAssociations.cpp
+++ b/flang/lib/Lower/HostAssociations.cpp
@@ -376,9 +376,9 @@ class CapturedArrays : public CapturedSymbols<CapturedArrays> {
                              const Fortran::semantics::Symbol &sym) {
     mlir::Type type = converter.genType(sym);
     bool isPolymorphic = Fortran::semantics::IsPolymorphic(sym);
-    assert(type.isa<fir::SequenceType>() ||
-           (isPolymorphic && type.isa<fir::ClassType>()) &&
-               "must be a sequence type");
+    assert((type.isa<fir::SequenceType>() ||
+            (isPolymorphic && type.isa<fir::ClassType>())) &&
+           "must be a sequence type");
     if (isPolymorphic)
       return type;
     return fir::BoxType::get(type);

From 26b8be201e2d15867bb327a8008fffb3e34d42a5 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Wed, 28 Feb 2024 10:15:57 +0100
Subject: [PATCH 046/114] [flang][OpenMP][MLIR] Basic support for delayed
 privatization code-gen (#81833)

Adds basic support for emitting delayed privatizers from flang. So far,
only symbols of scalar types are supported (i.e. scalars); support for
more complicated types will be added later.

This also makes sure that reduction and delayed privatization work
properly together by merging the body-gen callbacks for both in case
both clauses are present on the parallel construct.
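To illustrate, with delayed privatization a `private(var1)` clause on a
parallel region is lowered to an `omp.private` op plus a `private` clause
argument on `omp.parallel`, roughly along these lines (a hand-written
sketch distilled from the tests added below; the privatizer symbol name
is illustrative, the real mangled names are longer):

    omp.private {type = private} @var1.privatizer : !fir.ref<i32> alloc {
    ^bb0(%arg0: !fir.ref<i32>):
      %0 = fir.alloca i32 {bindc_name = "var1", pinned}
      omp.yield(%0 : !fir.ref<i32>)
    }

    func.func @example() {
      %orig = fir.alloca i32 {bindc_name = "var1"}
      omp.parallel private(@var1.privatizer %orig -> %arg : !fir.ref<i32>) {
        %c10 = arith.constant 10 : i32
        fir.store %c10 to %arg : !fir.ref<i32>
        omp.terminator
      }
      return
    }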
--- flang/include/flang/Lower/AbstractConverter.h | 6 + flang/lib/Lower/Bridge.cpp | 13 ++- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 107 +++++++++++++++--- flang/lib/Lower/OpenMP/DataSharingProcessor.h | 38 ++++++- flang/lib/Lower/OpenMP/OpenMP.cpp | 91 +++++++++++++-- flang/lib/Lower/OpenMP/Utils.cpp | 6 + flang/lib/Lower/OpenMP/Utils.h | 1 + .../delayed-privatization-firstprivate.f90 | 29 +++++ .../FIR/delayed-privatization-private.f90 | 38 +++++++ .../delayed-privatization-firstprivate.f90 | 29 +++++ ...yed-privatization-private-firstprivate.f90 | 34 ++++++ .../OpenMP/delayed-privatization-private.f90 | 28 +++++ .../delayed-privatization-reduction.f90 | 30 +++++ 13 files changed, 422 insertions(+), 28 deletions(-) create mode 100644 flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 create mode 100644 flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-private.f90 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-reduction.f90 diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index e2af59e0aaa1968..fa3fd9a8cc05eef 100644 --- a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -16,6 +16,7 @@ #include "flang/Common/Fortran.h" #include "flang/Lower/LoweringOptions.h" #include "flang/Lower/PFTDefs.h" +#include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Semantics/symbol.h" #include "mlir/IR/Builders.h" @@ -299,6 +300,11 @@ class AbstractConverter { return loweringOptions; } + /// Find the symbol in one level up of symbol map such as for host-association + /// in OpenMP code or return null. + virtual Fortran::lower::SymbolBox + lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) = 0; + private: /// Options controlling lowering behavior. const Fortran::lower::LoweringOptions &loweringOptions; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index f865b53f74debd1..0691753defceb30 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1000,6 +1000,17 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (sym.detailsIf()) return symMap->lookupSymbol(sym); + // For symbols to be privatized in OMP, the symbol is mapped to an + // instance of `SymbolBox::Intrinsic` (i.e. a direct mapping to an MLIR + // SSA value). This MLIR SSA value is the block argument to the + // `omp.private`'s `alloc` block. If this is the case, we return this + // `SymbolBox::Intrinsic` value. + if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym)) + return v.match( + [&](const Fortran::lower::SymbolBox::Intrinsic &) + -> Fortran::lower::SymbolBox { return v; }, + [](const auto &) -> Fortran::lower::SymbolBox { return {}; }); + return {}; } if (Fortran::lower::SymbolBox v = symMap->lookupSymbol(sym)) @@ -1018,7 +1029,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { /// Find the symbol in one level up of symbol map such as for host-association /// in OpenMP code or return null. 
Fortran::lower::SymbolBox - lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) { + lookupOneLevelUpSymbol(const Fortran::semantics::Symbol &sym) override { if (Fortran::lower::SymbolBox v = localSymbols.lookupOneLevelUpSymbol(sym)) return v; return {}; diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 136bda0b582ee37..d6287cb4bc239eb 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -66,9 +66,10 @@ void DataSharingProcessor::cloneSymbol(const Fortran::semantics::Symbol *sym) { } void DataSharingProcessor::copyFirstPrivateSymbol( - const Fortran::semantics::Symbol *sym) { + const Fortran::semantics::Symbol *sym, + mlir::OpBuilder::InsertPoint *copyAssignIP) { if (sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate)) - converter.copyHostAssociateVar(*sym); + converter.copyHostAssociateVar(*sym, copyAssignIP); } void DataSharingProcessor::copyLastPrivateSymbol( @@ -307,14 +308,10 @@ void DataSharingProcessor::privatize() { for (const Fortran::semantics::Symbol *sym : privatizedSymbols) { if (const auto *commonDet = sym->detailsIf()) { - for (const auto &mem : commonDet->objects()) { - cloneSymbol(&*mem); - copyFirstPrivateSymbol(&*mem); - } - } else { - cloneSymbol(sym); - copyFirstPrivateSymbol(sym); - } + for (const auto &mem : commonDet->objects()) + doPrivatize(&*mem); + } else + doPrivatize(sym); } } @@ -338,11 +335,95 @@ void DataSharingProcessor::defaultPrivatize() { !sym->GetUltimate().has() && !symbolsInNestedRegions.contains(sym) && !symbolsInParentRegions.contains(sym) && - !privatizedSymbols.contains(sym)) { + !privatizedSymbols.contains(sym)) + doPrivatize(sym); + } +} + +void DataSharingProcessor::doPrivatize(const Fortran::semantics::Symbol *sym) { + if (!useDelayedPrivatization) { + cloneSymbol(sym); + copyFirstPrivateSymbol(sym); + return; + } + + Fortran::lower::SymbolBox hsb = converter.lookupOneLevelUpSymbol(*sym); + assert(hsb && "Host symbol box not found"); + + mlir::Type symType = hsb.getAddr().getType(); + mlir::Location symLoc = hsb.getAddr().getLoc(); + std::string privatizerName = sym->name().ToString() + ".privatizer"; + bool isFirstPrivate = + sym->test(Fortran::semantics::Symbol::Flag::OmpFirstPrivate); + + mlir::omp::PrivateClauseOp privatizerOp = [&]() { + auto moduleOp = firOpBuilder.getModule(); + auto uniquePrivatizerName = fir::getTypeAsString( + symType, converter.getKindMap(), + converter.mangleName(*sym) + + (isFirstPrivate ? "_firstprivate" : "_private")); + + if (auto existingPrivatizer = + moduleOp.lookupSymbol( + uniquePrivatizerName)) + return existingPrivatizer; + + auto ip = firOpBuilder.saveInsertionPoint(); + firOpBuilder.setInsertionPoint(&moduleOp.getBodyRegion().front(), + moduleOp.getBodyRegion().front().begin()); + auto result = firOpBuilder.create( + symLoc, uniquePrivatizerName, symType, + isFirstPrivate ? mlir::omp::DataSharingClauseType::FirstPrivate + : mlir::omp::DataSharingClauseType::Private); + + symTable->pushScope(); + + // Populate the `alloc` region. 
+ { + mlir::Region &allocRegion = result.getAllocRegion(); + mlir::Block *allocEntryBlock = firOpBuilder.createBlock( + &allocRegion, /*insertPt=*/{}, symType, symLoc); + + firOpBuilder.setInsertionPointToEnd(allocEntryBlock); + symTable->addSymbol(*sym, allocRegion.getArgument(0)); + symTable->pushScope(); cloneSymbol(sym); - copyFirstPrivateSymbol(sym); + firOpBuilder.create( + hsb.getAddr().getLoc(), + symTable->shallowLookupSymbol(*sym).getAddr()); + symTable->popScope(); } - } + + // Populate the `copy` region if this is a `firstprivate`. + if (isFirstPrivate) { + mlir::Region ©Region = result.getCopyRegion(); + // First block argument corresponding to the original/host value while + // second block argument corresponding to the privatized value. + mlir::Block *copyEntryBlock = firOpBuilder.createBlock( + ©Region, /*insertPt=*/{}, {symType, symType}, {symLoc, symLoc}); + firOpBuilder.setInsertionPointToEnd(copyEntryBlock); + symTable->addSymbol(*sym, copyRegion.getArgument(0), + /*force=*/true); + symTable->pushScope(); + symTable->addSymbol(*sym, copyRegion.getArgument(1)); + auto ip = firOpBuilder.saveInsertionPoint(); + copyFirstPrivateSymbol(sym, &ip); + + firOpBuilder.create( + hsb.getAddr().getLoc(), + symTable->shallowLookupSymbol(*sym).getAddr()); + symTable->popScope(); + } + + symTable->popScope(); + firOpBuilder.restoreInsertionPoint(ip); + return result; + }(); + + delayedPrivatizationInfo.privatizers.push_back( + mlir::SymbolRefAttr::get(privatizerOp)); + delayedPrivatizationInfo.originalAddresses.push_back(hsb.getAddr()); + delayedPrivatizationInfo.symbols.push_back(sym); } } // namespace omp diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 10c0a30c09c3915..9f7301df07598f4 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -23,6 +23,24 @@ namespace lower { namespace omp { class DataSharingProcessor { +public: + /// Collects all the information needed for delayed privatization. This can be + /// used by ops with data-sharing clauses to properly generate their regions + /// (e.g. add region arguments) and map the original SSA values to their + /// corresponding OMP region operands. + struct DelayedPrivatizationInfo { + // The list of symbols referring to delayed privatizer ops (i.e. + // `omp.private` ops). + llvm::SmallVector privatizers; + // SSA values that correspond to "original" values being privatized. + // "Original" here means the SSA value outside the OpenMP region from which + // a clone is created inside the region. + llvm::SmallVector originalAddresses; + // Fortran symbols corresponding to the above SSA values. 
+ llvm::SmallVector symbols; + }; + +private: bool hasLastPrivateOp; mlir::OpBuilder::InsertPoint lastPrivIP; mlir::OpBuilder::InsertPoint insPt; @@ -36,6 +54,9 @@ class DataSharingProcessor { fir::FirOpBuilder &firOpBuilder; const Fortran::parser::OmpClauseList &opClauseList; Fortran::lower::pft::Evaluation &eval; + bool useDelayedPrivatization; + Fortran::lower::SymMap *symTable; + DelayedPrivatizationInfo delayedPrivatizationInfo; bool needBarrier(); void collectSymbols(Fortran::semantics::Symbol::Flag flag); @@ -47,10 +68,13 @@ class DataSharingProcessor { void collectDefaultSymbols(); void privatize(); void defaultPrivatize(); + void doPrivatize(const Fortran::semantics::Symbol *sym); void copyLastPrivatize(mlir::Operation *op); void insertLastPrivateCompare(mlir::Operation *op); void cloneSymbol(const Fortran::semantics::Symbol *sym); - void copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym); + void + copyFirstPrivateSymbol(const Fortran::semantics::Symbol *sym, + mlir::OpBuilder::InsertPoint *copyAssignIP = nullptr); void copyLastPrivateSymbol(const Fortran::semantics::Symbol *sym, mlir::OpBuilder::InsertPoint *lastPrivIP); void insertDeallocs(); @@ -58,10 +82,14 @@ class DataSharingProcessor { public: DataSharingProcessor(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OmpClauseList &opClauseList, - Fortran::lower::pft::Evaluation &eval) + Fortran::lower::pft::Evaluation &eval, + bool useDelayedPrivatization = false, + Fortran::lower::SymMap *symTable = nullptr) : hasLastPrivateOp(false), converter(converter), firOpBuilder(converter.getFirOpBuilder()), opClauseList(opClauseList), - eval(eval) {} + eval(eval), useDelayedPrivatization(useDelayedPrivatization), + symTable(symTable) {} + // Privatisation is split into two steps. // Step1 performs cloning of all privatisation clauses and copying for // firstprivates. 
Step1 is performed at the place where process/processStep1 @@ -80,6 +108,10 @@ class DataSharingProcessor { assert(!loopIV && "Loop iteration variable already set"); loopIV = iv; } + + const DelayedPrivatizationInfo &getDelayedPrivatizationInfo() const { + return delayedPrivatizationInfo; + } }; } // namespace omp diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 7953bf83cba0fee..21ad51c53d8742d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -558,6 +558,7 @@ genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, static mlir::omp::ParallelOp genParallelOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, bool genNested, mlir::Location currentLocation, @@ -590,8 +591,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, auto reductionCallback = [&](mlir::Operation *op) { llvm::SmallVector locs(reductionVars.size(), currentLocation); - auto block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {}, - reductionTypes, locs); + auto *block = converter.getFirOpBuilder().createBlock(&op->getRegion(0), {}, + reductionTypes, locs); for (auto [arg, prv] : llvm::zip_equal(reductionSymbols, block->getArguments())) { converter.bindSymbol(*arg, prv); @@ -599,13 +600,78 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, return reductionSymbols; }; - return genOpWithBody( + OpWithBodyGenInfo genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) .setGenNested(genNested) .setOuterCombined(outerCombined) .setClauses(&clauseList) .setReductions(&reductionSymbols, &reductionTypes) - .setGenRegionEntryCb(reductionCallback), + .setGenRegionEntryCb(reductionCallback); + + if (!enableDelayedPrivatization) { + return genOpWithBody( + genInfo, + /*resultTypes=*/mlir::TypeRange(), ifClauseOperand, + numThreadsClauseOperand, allocateOperands, allocatorOperands, + reductionVars, + reductionDeclSymbols.empty() + ? 
nullptr + : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(), + reductionDeclSymbols), + procBindKindAttr, /*private_vars=*/llvm::SmallVector{}, + /*privatizers=*/nullptr); + } + + bool privatize = !outerCombined; + DataSharingProcessor dsp(converter, clauseList, eval, + /*useDelayedPrivatization=*/true, &symTable); + + if (privatize) + dsp.processStep1(); + + const auto &delayedPrivatizationInfo = dsp.getDelayedPrivatizationInfo(); + + auto genRegionEntryCB = [&](mlir::Operation *op) { + auto parallelOp = llvm::cast(op); + + llvm::SmallVector reductionLocs(reductionVars.size(), + currentLocation); + + mlir::OperandRange privateVars = parallelOp.getPrivateVars(); + mlir::Region ®ion = parallelOp.getRegion(); + + llvm::SmallVector privateVarTypes = reductionTypes; + privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarTypes), + [](mlir::Value v) { return v.getType(); }); + + llvm::SmallVector privateVarLocs = reductionLocs; + privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarLocs), + [](mlir::Value v) { return v.getLoc(); }); + + converter.getFirOpBuilder().createBlock(®ion, /*insertPt=*/{}, + privateVarTypes, privateVarLocs); + + llvm::SmallVector allSymbols = + reductionSymbols; + allSymbols.append(delayedPrivatizationInfo.symbols); + for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { + converter.bindSymbol(*arg, prv); + } + + return allSymbols; + }; + + // TODO Merge with the reduction CB. + genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); + + llvm::SmallVector privatizers( + delayedPrivatizationInfo.privatizers.begin(), + delayedPrivatizationInfo.privatizers.end()); + + return genOpWithBody( + genInfo, /*resultTypes=*/mlir::TypeRange(), ifClauseOperand, numThreadsClauseOperand, allocateOperands, allocatorOperands, reductionVars, @@ -613,8 +679,11 @@ genParallelOp(Fortran::lower::AbstractConverter &converter, ? nullptr : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(), reductionDeclSymbols), - procBindKindAttr, /*private_vars=*/llvm::SmallVector{}, - /*privatizers=*/nullptr); + procBindKindAttr, delayedPrivatizationInfo.originalAddresses, + delayedPrivatizationInfo.privatizers.empty() + ? 
nullptr
+          : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
+                                 privatizers));
 }

 static mlir::omp::SectionOp
@@ -1621,7 +1690,7 @@ static void genOMP(Fortran::lower::AbstractConverter &converter,
   if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet)
           .test(ompDirective)) {
     validDirective = true;
-    genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+    genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
                   currentLocation, loopOpClauseList,
                   /*outerCombined=*/true);
   }
@@ -1711,8 +1780,8 @@ genOMP(Fortran::lower::AbstractConverter &converter,
                    currentLocation);
     break;
   case llvm::omp::Directive::OMPD_parallel:
-    genParallelOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
-                  beginClauseList);
+    genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/true,
+                  currentLocation, beginClauseList);
     break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation,
@@ -1769,7 +1838,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,
           .test(directive.v)) {
     bool outerCombined =
         directive.v != llvm::omp::Directive::OMPD_target_parallel;
-    genParallelOp(converter, semaCtx, eval, /*genNested=*/false,
+    genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false,
                   currentLocation, beginClauseList, outerCombined);
     combinedDirective = true;
   }
@@ -1852,7 +1921,7 @@ genOMP(Fortran::lower::AbstractConverter &converter,

   // Parallel wrapper of PARALLEL SECTIONS construct
   if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
-    genParallelOp(converter, semaCtx, eval,
+    genParallelOp(converter, symTable, semaCtx, eval,
                   /*genNested=*/false, currentLocation, sectionsClauseList,
                   /*outerCombined=*/true);
   } else {
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 31b15257d18687a..49517f62895dfe7 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -24,6 +24,12 @@ llvm::cl::opt<bool> treatIndexAsSection(
     llvm::cl::desc("In the OpenMP data clauses treat `a(N)` as `a(N:N)`."),
     llvm::cl::init(true));

+llvm::cl::opt<bool> enableDelayedPrivatization(
+    "openmp-enable-delayed-privatization",
+    llvm::cl::desc(
+        "Emit `[first]private` variables as clauses on the MLIR ops."),
+    llvm::cl::init(false));
+
 namespace Fortran {
 namespace lower {
 namespace omp {
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index c346f891f0797e1..f57cd7420ce4d50 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -15,6 +15,7 @@
 #include "llvm/Support/CommandLine.h"

 extern llvm::cl::opt<bool> treatIndexAsSection;
+extern llvm::cl::opt<bool> enableDelayedPrivatization;

 namespace fir {
 class FirOpBuilder;
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
new file mode 100644
index 000000000000000..122542345f104b1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `firstprivate` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+  implicit none
+  integer :: var1
+
+!$OMP PARALLEL FIRSTPRIVATE(var1)
+  var1 = 10
+!$OMP END PARALLEL
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK: fir.store %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] : !fir.ref<i32>
+! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
+
diff --git a/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
new file mode 100644
index 000000000000000..2e9995ea1fd4c47
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/delayed-privatization-private.f90
@@ -0,0 +1,38 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+  implicit none
+  integer :: var1
+
+!$OMP PARALLEL PRIVATE(var1)
+  var1 = 10
+!$OMP END PARALLEL
+
+!$OMP PARALLEL PRIVATE(var1)
+  var1 = 20
+!$OMP END PARALLEL
+
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT: omp.yield(%[[PRIV_ALLOC]] : !fir.ref<i32>)
+! CHECK-NOT: } copy {

+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C10:.*]] = arith.constant 10 : i32
+! CHECK: fir.store %[[C10]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
+
+! Test that the same privatizer is used if a variable with the same type and
+! name was previously privatized.
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_ALLOC]] -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[C20:.*]] = arith.constant 20 : i32
+! CHECK: fir.store %[[C20]] to %[[PAR_ARG]] : !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
new file mode 100644
index 000000000000000..e3d2a5a8af26086
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-firstprivate.f90
@@ -0,0 +1,29 @@
+! Test delayed privatization for the `firstprivate` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_firstprivate
+  implicit none
+  integer :: var1
+
+!$omp parallel firstprivate(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_firstprivateEvar1"}
+! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_firstprivateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK: } copy {
+! CHECK: ^bb0(%[[PRIV_ORIG_ARG:.*]]: !fir.ref<i32>, %[[PRIV_PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK: %[[ORIG_VAL:.*]] = fir.load %[[PRIV_ORIG_ARG]] : !fir.ref<i32>
+! CHECK: hlfir.assign %[[ORIG_VAL]] to %[[PRIV_PRIV_ARG]] temporary_lhs : i32, !fir.ref<i32>
+! CHECK: omp.yield(%[[PRIV_PRIV_ARG]] : !fir.ref<i32>)
+! CHECK: }
+
+! CHECK-LABEL: @_QPdelayed_privatization_firstprivate
+! CHECK: omp.parallel private(@[[VAR1_PRIVATIZER_SYM]] %{{.*}} -> %{{.*}} : !fir.ref<i32>) {
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
new file mode 100644
index 000000000000000..46eef6eb3bcf6af
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private-firstprivate.f90
@@ -0,0 +1,34 @@
+! Test delayed privatization for both `private` and `firstprivate` clauses.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private_firstprivate
+  implicit none
+  integer :: var1
+  integer :: var2
+
+!$omp parallel private(var1) firstprivate(var2)
+  var1 = 10
+  var2 = var1 + var2
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = firstprivate}
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: } copy {
+! CHECK: }
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK: }
+
+! CHECK-LABEL: func.func @_QPdelayed_privatization_private_firstprivate() {
+! CHECK: %[[VAR1_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1"
+! CHECK: %[[VAR1_DECL:.*]]:2 = hlfir.declare %[[VAR1_ALLOC]]
+
+! CHECK: %[[VAR2_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var2"
+! CHECK: %[[VAR2_DECL:.*]]:2 = hlfir.declare %[[VAR2_ALLOC]]
+
+! CHECK: omp.parallel private(
+! CHECK-SAME: @[[VAR1_PRIVATIZER_SYM]] %[[VAR1_DECL]]#0 -> %{{.*}} : !fir.ref<i32>,
+! CHECK-SAME: @[[VAR2_PRIVATIZER_SYM]] %[[VAR2_DECL]]#0 -> %{{.*}} : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-private.f90 b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
new file mode 100644
index 000000000000000..240e0e71bfcd16a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-private.f90
@@ -0,0 +1,28 @@
+! Test delayed privatization for the `private` clause.
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine delayed_privatization_private
+  implicit none
+  integer :: var1
+
+!$omp parallel private(var1)
+  var1 = 10
+!$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+! CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !fir.ref<i32>):
+! CHECK-NEXT: %[[PRIV_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", pinned, uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK-NEXT: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[PRIV_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-NEXT: omp.yield(%[[PRIV_DECL]]#0 : !fir.ref<i32>)
+! CHECK-NOT: } copy {
+
+! CHECK-LABEL: @_QPdelayed_privatization_private
+! CHECK: %[[ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "var1", uniq_name = "_QFdelayed_privatization_privateEvar1"}
+! CHECK: %[[ORIG_DECL:.*]]:2 = hlfir.declare %[[ORIG_ALLOC]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: omp.parallel private(@[[PRIVATIZER_SYM]] %[[ORIG_DECL]]#0 -> %[[PAR_ARG:.*]] : !fir.ref<i32>) {
+! CHECK: %[[PAR_ARG_DECL:.*]]:2 = hlfir.declare %[[PAR_ARG]] {uniq_name = "_QFdelayed_privatization_privateEvar1"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: hlfir.assign %{{.*}} to %[[PAR_ARG_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
new file mode 100644
index 000000000000000..c61f352b9b055a0
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction.f90
@@ -0,0 +1,30 @@
+! Test that reductions and delayed privatization work properly together. Since
+! both types of clauses add block arguments to the OpenMP region, we make sure
+! that the block arguments are added in the proper order (reductions first and
+! then delayed privatization).
+
+! RUN: bbc -emit-hlfir -fopenmp --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine red_and_delayed_private
+  integer :: red
+  integer :: prv
+
+  red = 0
+  prv = 10
+
+  !$omp parallel reduction(+:red) private(prv)
+  red = red + 1
+  prv = 20
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+
+! CHECK-LABEL: omp.reduction.declare
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : i32 init
+
+! CHECK-LABEL: _QPred_and_delayed_private
+! CHECK: omp.parallel
+! CHECK-SAME: reduction(@[[REDUCTION_SYM]] %{{.*}} -> %arg0 : !fir.ref<i32>)
+! CHECK-SAME: private(@[[PRIVATIZER_SYM]] %{{.*}} -> %arg1 : !fir.ref<i32>) {

From 6008cd40b7bbdc66555550c2e38648d5ce99cc78 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Wed, 28 Feb 2024 10:22:45 +0100
Subject: [PATCH 047/114] [mlir][Transforms] Dialect conversion: Assert when
 accessing erased ops (#83132)

The dialect conversion maintains sets of "ignored" and "replaced" ops.
This change simplifies the two sets, such that all nested ops are
included. (This was previously not the case and sometimes only the
parent op was included.) This change allows for more aggressive
assertions to prevent incorrect rewriter API usage. E.g., accessing
ops/blocks/regions within an erased op.

A concrete example: I have seen conversion patterns in downstream
projects where an op is replaced with a new op, and the region of the
old op is afterwards inlined into the newly created op. This is invalid
rewriter API usage: ops that were replaced/erased should not be
accessed. Nested ops will be considered "ignored", even if they are
moved to a different region after the region's parent op was erased
(which is illegal API usage). Instead, create a new op, inline the
regions, then replace the old op with the new op.
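For illustration, the invalid pattern and its fix look roughly like this
(a sketch only; `op`, `newOp` and the region index are illustrative, and
`rewriter` is the ConversionPatternRewriter passed to the pattern):

    // Invalid: `op` is accessed after it was replaced. With this change,
    // the rewriter asserts on the inlineRegionBefore call.
    rewriter.replaceOp(op, newOp->getResults());
    rewriter.inlineRegionBefore(op->getRegion(0), newOp->getRegion(0),
                                newOp->getRegion(0).end());

    // Valid: move the region into the new op first, then replace the old op.
    rewriter.inlineRegionBefore(op->getRegion(0), newOp->getRegion(0),
                                newOp->getRegion(0).end());
    rewriter.replaceOp(op, newOp->getResults());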
--- .../Transforms/Utils/DialectConversion.cpp | 93 +++++++++++-------- mlir/test/lib/Dialect/Test/TestPatterns.cpp | 1 - 2 files changed, 55 insertions(+), 39 deletions(-) diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index f967e8352bf4c8b..5d399ce1eb9cf02 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -798,13 +798,12 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { PatternRewriter &rewriter, ValueRange values, SmallVectorImpl &remapped); - /// Returns true if the given operation is ignored, and does not need to be + /// Return "true" if the given operation is ignored, and does not need to be /// converted. bool isOpIgnored(Operation *op) const; - /// Recursively marks the nested operations under 'op' as ignored. This - /// removes them from being considered for legalization. - void markNestedOpsIgnored(Operation *op); + /// Return "true" if the given operation was replaced or erased. + bool wasOpReplaced(Operation *op) const; //===--------------------------------------------------------------------===// // Type Conversion @@ -946,18 +945,15 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Ordered list of block operations (creations, splits, motions). SmallVector> rewrites; - /// A set of operations that should no longer be considered for legalization, - /// but were not directly replace/erased/etc. by a pattern. These are - /// generally child operations of other operations who were - /// replaced/erased/etc. This is not meant to be an exhaustive list of all - /// operations, but the minimal set that can be used to detect if a given - /// operation should be `ignored`. For example, we may add the operations that - /// define non-empty regions to the set, but not any of the others. This - /// simplifies the amount of memory needed as we can query if the parent - /// operation was ignored. + /// A set of operations that should no longer be considered for legalization. + /// E.g., ops that are recursively legal. Ops that were replaced/erased are + /// tracked separately. SetVector ignoredOps; - // A set of operations that were erased. + /// A set of operations that were replaced/erased. Such ops are not erased + /// immediately but only when the dialect conversion succeeds. In the mean + /// time, they should no longer be considered for legalization and any attempt + /// to modify/access them is invalid rewriter API usage. SetVector replacedOps; /// The current type converter, or nullptr if no type converter is currently @@ -1237,24 +1233,14 @@ LogicalResult ConversionPatternRewriterImpl::remapValues( return success(); } -// TODO: This function is a misnomer. It does not actually check if `op` is in -// `ignoredOps`. bool ConversionPatternRewriterImpl::isOpIgnored(Operation *op) const { - // Check to see if this operation or the parent operation is ignored. - return ignoredOps.count(op->getParentOp()) || replacedOps.count(op); + // Check to see if this operation is ignored or was replaced. + return replacedOps.count(op) || ignoredOps.count(op); } -void ConversionPatternRewriterImpl::markNestedOpsIgnored(Operation *op) { - // Walk this operation and collect nested operations that define non-empty - // regions. We mark such operations as 'ignored' so that we know we don't have - // to convert them, or their nested ops. 
- if (op->getNumRegions() == 0) - return; - op->walk([&](Operation *op) { - if (llvm::any_of(op->getRegions(), - [](Region ®ion) { return !region.empty(); })) - ignoredOps.insert(op); - }); +bool ConversionPatternRewriterImpl::wasOpReplaced(Operation *op) const { + // Check to see if this operation was replaced. + return replacedOps.count(op); } //===----------------------------------------------------------------------===// @@ -1476,6 +1462,9 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( logger.startLine() << "** Insert : '" << op->getName() << "'(" << op << ")\n"; }); + assert(!wasOpReplaced(op->getParentOp()) && + "attempting to insert into a block within a replaced/erased op"); + if (!previous.isSet()) { // This is a newly created op. appendRewrite(op); @@ -1490,7 +1479,7 @@ void ConversionPatternRewriterImpl::notifyOperationInserted( void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, ValueRange newValues) { assert(newValues.size() == op->getNumResults()); - assert(!replacedOps.contains(op) && "operation was already replaced"); + assert(!ignoredOps.contains(op) && "operation was already replaced"); // Track if any of the results changed, e.g. erased and replaced with null. bool resultChanged = false; @@ -1509,10 +1498,8 @@ void ConversionPatternRewriterImpl::notifyOpReplaced(Operation *op, appendRewrite(op, currentTypeConverter, resultChanged); - // Mark this operation as recursively ignored so that we don't need to - // convert any nested operations. - replacedOps.insert(op); - markNestedOpsIgnored(op); + // Mark this operation and all nested ops as replaced. + op->walk([&](Operation *op) { replacedOps.insert(op); }); } void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) { @@ -1523,6 +1510,9 @@ void ConversionPatternRewriterImpl::notifyBlockIsBeingErased(Block *block) { void ConversionPatternRewriterImpl::notifyBlockInserted( Block *block, Region *previous, Region::iterator previousIt) { + assert(!wasOpReplaced(block->getParentOp()) && + "attempting to insert into a region within a replaced/erased op"); + if (!previous) { // This is a newly created block. appendRewrite(block); @@ -1604,6 +1594,9 @@ void ConversionPatternRewriter::eraseOp(Operation *op) { } void ConversionPatternRewriter::eraseBlock(Block *block) { + assert(!impl->wasOpReplaced(block->getParentOp()) && + "attempting to erase a block within a replaced/erased op"); + // Mark all ops for erasure. 
for (Operation &op : *block) eraseOp(&op); @@ -1619,18 +1612,27 @@ void ConversionPatternRewriter::eraseBlock(Block *block) { Block *ConversionPatternRewriter::applySignatureConversion( Region *region, TypeConverter::SignatureConversion &conversion, const TypeConverter *converter) { + assert(!impl->wasOpReplaced(region->getParentOp()) && + "attempting to apply a signature conversion to a block within a " + "replaced/erased op"); return impl->applySignatureConversion(region, conversion, converter); } FailureOr ConversionPatternRewriter::convertRegionTypes( Region *region, const TypeConverter &converter, TypeConverter::SignatureConversion *entryConversion) { + assert(!impl->wasOpReplaced(region->getParentOp()) && + "attempting to apply a signature conversion to a block within a " + "replaced/erased op"); return impl->convertRegionTypes(region, converter, entryConversion); } LogicalResult ConversionPatternRewriter::convertNonEntryRegionTypes( Region *region, const TypeConverter &converter, ArrayRef blockConversions) { + assert(!impl->wasOpReplaced(region->getParentOp()) && + "attempting to apply a signature conversion to a block within a " + "replaced/erased op"); return impl->convertNonEntryRegionTypes(region, converter, blockConversions); } @@ -1665,6 +1667,8 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, Block *ConversionPatternRewriter::splitBlock(Block *block, Block::iterator before) { + assert(!impl->wasOpReplaced(block->getParentOp()) && + "attempting to split a block within a replaced/erased op"); auto *continuation = block->splitBlock(before); impl->notifySplitBlock(block, continuation); return continuation; @@ -1673,15 +1677,19 @@ Block *ConversionPatternRewriter::splitBlock(Block *block, void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { +#ifndef NDEBUG assert(argValues.size() == source->getNumArguments() && "incorrect # of argument replacement values"); -#ifndef NDEBUG + assert(!impl->wasOpReplaced(source->getParentOp()) && + "attempting to inline a block from a replaced/erased op"); + assert(!impl->wasOpReplaced(dest->getParentOp()) && + "attempting to inline a block into a replaced/erased op"); auto opIgnored = [&](Operation *op) { return impl->isOpIgnored(op); }; -#endif // NDEBUG // The source block will be deleted, so it should not have any users (i.e., // there should be no predecessors). assert(llvm::all_of(source->getUsers(), opIgnored) && "expected 'source' to have no predecessors"); +#endif // NDEBUG impl->notifyBlockBeingInlined(dest, source, before); for (auto it : llvm::zip(source->getArguments(), argValues)) @@ -1691,6 +1699,8 @@ void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, } void ConversionPatternRewriter::startOpModification(Operation *op) { + assert(!impl->wasOpReplaced(op) && + "attempting to modify a replaced/erased op"); #ifndef NDEBUG impl->pendingRootUpdates.insert(op); #endif @@ -1698,6 +1708,8 @@ void ConversionPatternRewriter::startOpModification(Operation *op) { } void ConversionPatternRewriter::finalizeOpModification(Operation *op) { + assert(!impl->wasOpReplaced(op) && + "attempting to modify a replaced/erased op"); PatternRewriter::finalizeOpModification(op); // There is nothing to do here, we only need to track the operation at the // start of the update. 
@@ -1912,8 +1924,13 @@ OperationLegalizer::legalize(Operation *op,
 // If this operation is recursively legal, mark its children as ignored so
 // that we don't consider them for legalization.
- if (legalityInfo->isRecursivelyLegal)
- rewriter.getImpl().markNestedOpsIgnored(op);
+ if (legalityInfo->isRecursivelyLegal) {
+ op->walk([&](Operation *nested) {
+ if (op != nested)
+ rewriter.getImpl().ignoredOps.insert(nested);
+ });
+ }
+
 return success();
 }
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index bde4255ee4b3680..abc0e43c7b7f2dd 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1768,7 +1768,6 @@ struct TestMergeSingleBlockOps
 rewriter.inlineBlockBefore(&innerBlock, op);
 rewriter.eraseOp(innerTerminator);
 rewriter.eraseOp(op);
- rewriter.modifyOpInPlace(op, [] {});
 return success();
 }
 };

From 6e41d60a717132fadac74abe61ac6a9b1ca98778 Mon Sep 17 00:00:00 2001
From: David Green
Date: Wed, 28 Feb 2024 09:43:05 +0000
Subject: [PATCH 048/114] [SelectionDAG] Change computeAliasing signature from optional to LocationSize. (#83017)

This is another smaller step of #70452, changing the signature of computeAliasing() from optional to LocationSize, and follow-up changes in DAGCombiner::mayAlias(). There are some test changes due to the previous AA->isNoAlias call incorrectly using an unknown size (~UINT64_C(0)). This should then be improved again in #70452 when the types are known to be scalable.
---
 .../CodeGen/SelectionDAGAddressAnalysis.h | 7 +-
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 37 +++++----
 .../SelectionDAGAddressAnalysis.cpp | 20 +++--
 .../alloca-load-store-scalable-array.ll | 36 ++++----
 .../alloca-load-store-scalable-struct.ll | 12 +--
 .../rvv/alloca-load-store-scalable-array.ll | 12 +--
 .../rvv/alloca-load-store-scalable-struct.ll | 8 +-
 .../SelectionDAGAddressAnalysisTest.cpp | 80 ++++++++-----------
 8 files changed, 102 insertions(+), 110 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
index 3d0f836b0c75784..29de6bd8685e061 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGAddressAnalysis.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
 #define LLVM_CODEGEN_SELECTIONDAGADDRESSANALYSIS_H
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include
@@ -81,10 +82,8 @@ class BaseIndexOffset {
 // Returns true `Op0` and `Op1` can be proven to alias/not alias, in
 // which case `IsAlias` is set to true/false.
- static bool computeAliasing(const SDNode *Op0,
- const std::optional NumBytes0,
- const SDNode *Op1,
- const std::optional NumBytes1,
+ static bool computeAliasing(const SDNode *Op0, const LocationSize NumBytes0,
+ const SDNode *Op1, const LocationSize NumBytes1,
 const SelectionDAG &DAG, bool &IsAlias);
 /// Parses tree in N for base, index, offset addresses.
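For orientation, a short sketch of the call-site migration this commit performs; the DAGCombiner.cpp hunks below carry out exactly this conversion, and the names `Size`, `Op0`, `Op1`, and `DAG` here are illustrative:

    // Before: an unset std::optional<int64_t> meant "size unknown".
    // After: LocationSize spells both states out explicitly.
    LocationSize NumBytes = Size != ~UINT64_C(0)
                                ? LocationSize::precise(Size)
                                : LocationSize::beforeOrAfterPointer();

    bool IsAlias;
    bool IsValid = BaseIndexOffset::computeAliasing(Op0, NumBytes, Op1,
                                                    NumBytes, DAG, IsAlias);

    // Consumers now test NumBytes.hasValue() / NumBytes.isScalable() and
    // read the byte count via NumBytes.getValue() instead of dereferencing
    // an optional.
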
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c4d49adc21c4b7b..33ada3655dc731e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -27835,7 +27835,7 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { bool IsAtomic; SDValue BasePtr; int64_t Offset; - std::optional NumBytes; + LocationSize NumBytes; MachineMemOperand *MMO; }; @@ -27853,7 +27853,8 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { LSN->isAtomic(), LSN->getBasePtr(), Offset /*base offset*/, - std::optional(Size), + Size != ~UINT64_C(0) ? LocationSize::precise(Size) + : LocationSize::beforeOrAfterPointer(), LSN->getMemOperand()}; } if (const auto *LN = cast(N)) @@ -27861,13 +27862,15 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { /*isAtomic*/ false, LN->getOperand(1), (LN->hasOffset()) ? LN->getOffset() : 0, - (LN->hasOffset()) ? std::optional(LN->getSize()) - : std::optional(), + (LN->hasOffset()) ? LocationSize::precise(LN->getSize()) + : LocationSize::beforeOrAfterPointer(), (MachineMemOperand *)nullptr}; // Default. return {false /*isvolatile*/, - /*isAtomic*/ false, SDValue(), - (int64_t)0 /*offset*/, std::optional() /*size*/, + /*isAtomic*/ false, + SDValue(), + (int64_t)0 /*offset*/, + LocationSize::beforeOrAfterPointer() /*size*/, (MachineMemOperand *)nullptr}; }; @@ -27922,18 +27925,20 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { int64_t SrcValOffset1 = MUC1.MMO->getOffset(); Align OrigAlignment0 = MUC0.MMO->getBaseAlign(); Align OrigAlignment1 = MUC1.MMO->getBaseAlign(); - auto &Size0 = MUC0.NumBytes; - auto &Size1 = MUC1.NumBytes; + LocationSize Size0 = MUC0.NumBytes; + LocationSize Size1 = MUC1.NumBytes; if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && - Size0.has_value() && Size1.has_value() && *Size0 == *Size1 && - OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 && - SrcValOffset1 % *Size1 == 0) { + Size0.hasValue() && Size1.hasValue() && Size0 == Size1 && + OrigAlignment0 > Size0.getValue() && + SrcValOffset0 % Size0.getValue() == 0 && + SrcValOffset1 % Size1.getValue() == 0) { int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value(); int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value(); // There is no overlap between these relatively aligned accesses of // similar size. Return no alias. - if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0) + if ((OffAlign0 + (int64_t)Size0.getValue()) <= OffAlign1 || + (OffAlign1 + (int64_t)Size1.getValue()) <= OffAlign0) return false; } @@ -27946,12 +27951,12 @@ bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const { UseAA = false; #endif - if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 && - Size1) { + if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && + Size0.hasValue() && Size1.hasValue()) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); - int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset; - int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset; + int64_t Overlap0 = Size0.getValue() + SrcValOffset0 - MinOffset; + int64_t Overlap1 = Size1.getValue() + SrcValOffset1 - MinOffset; if (AA->isNoAlias( MemoryLocation(MUC0.MMO->getValue(), Overlap0, UseTBAA ? 
MUC0.MMO->getAAInfo() : AAMDNodes()), diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp index 66825d845c19109..9670c3ac8430eb9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -91,11 +91,10 @@ bool BaseIndexOffset::equalBaseIndex(const BaseIndexOffset &Other, } bool BaseIndexOffset::computeAliasing(const SDNode *Op0, - const std::optional NumBytes0, + const LocationSize NumBytes0, const SDNode *Op1, - const std::optional NumBytes1, + const LocationSize NumBytes1, const SelectionDAG &DAG, bool &IsAlias) { - BaseIndexOffset BasePtr0 = match(Op0, DAG); if (!BasePtr0.getBase().getNode()) return false; @@ -105,27 +104,26 @@ bool BaseIndexOffset::computeAliasing(const SDNode *Op0, return false; int64_t PtrDiff; - if (NumBytes0 && NumBytes1 && - BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) { + if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) { // If the size of memory access is unknown, do not use it to analysis. // One example of unknown size memory access is to load/store scalable // vector objects on the stack. // BasePtr1 is PtrDiff away from BasePtr0. They alias if none of the // following situations arise: - if (PtrDiff >= 0 && - *NumBytes0 != static_cast(MemoryLocation::UnknownSize)) { + if (PtrDiff >= 0 && NumBytes0.hasValue() && !NumBytes0.isScalable()) { // [----BasePtr0----] // [---BasePtr1--] // ========PtrDiff========> - IsAlias = !(*NumBytes0 <= PtrDiff); + IsAlias = !(static_cast(NumBytes0.getValue().getFixedValue()) <= + PtrDiff); return true; } - if (PtrDiff < 0 && - *NumBytes1 != static_cast(MemoryLocation::UnknownSize)) { + if (PtrDiff < 0 && NumBytes1.hasValue() && !NumBytes1.isScalable()) { // [----BasePtr0----] // [---BasePtr1--] // =====(-PtrDiff)====> - IsAlias = !((PtrDiff + *NumBytes1) <= 0); + IsAlias = !((PtrDiff + static_cast( + NumBytes1.getValue().getFixedValue())) <= 0); return true; } return false; diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll index 7244ac949ab88c3..9a4e01a29ecb6da 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll @@ -14,12 +14,12 @@ define void @array_1D(ptr %addr) #0 { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z2.d }, p0, [sp] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret @@ -81,18 +81,18 @@ define void @array_2D(ptr %addr) #0 { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d -; 
CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl] -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl] -; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #5, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #5, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp, #4, mul vl] +; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: st1d { z5.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: st1d { z4.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z2.d }, p0, [sp] ; CHECK-NEXT: addvl sp, sp, #6 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll index f03a6f018d34d0c..7292d52aaf4765a 100644 --- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll +++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll @@ -13,12 +13,12 @@ define void @test(ptr %addr) #0 { ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z2.d }, p0, [sp] ; CHECK-NEXT: addvl sp, sp, #3 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll index 1fe91c721f4dd2b..1d025a2f776f825 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-array.ll @@ -18,15 +18,15 @@ define void @test(ptr %addr) { ; CHECK-NEXT: add a2, a0, a1 ; CHECK-NEXT: vl1re64.v v8, (a2) ; CHECK-NEXT: slli a2, a1, 1 -; CHECK-NEXT: vl1re64.v v9, (a0) -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vl1re64.v v9, (a3) ; CHECK-NEXT: vl1re64.v v10, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v9, (a0) ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs1r.v v10, (a2) -; CHECK-NEXT: add a0, a0, a1 -; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: vs1r.v v9, (a2) +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vs1r.v v8, (a1) +; CHECK-NEXT: vs1r.v v10, (a0) ; CHECK-NEXT: csrrs a0, vlenb, zero ; CHECK-NEXT: slli a0, 
a0, 2 ; CHECK-NEXT: add sp, sp, a0 diff --git a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll index a9a680d54d58977..64031f8a93598f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll +++ b/llvm/test/CodeGen/RISCV/rvv/alloca-load-store-scalable-struct.ll @@ -16,13 +16,13 @@ define @test(ptr %addr, i64 %vl) { ; CHECK-NEXT: sub sp, sp, a2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb ; CHECK-NEXT: csrrs a2, vlenb, zero -; CHECK-NEXT: vl1re64.v v8, (a0) -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: add a3, a0, a2 +; CHECK-NEXT: vl1re64.v v8, (a3) ; CHECK-NEXT: vl1re64.v v9, (a0) ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs1r.v v8, (a0) ; CHECK-NEXT: add a2, a0, a2 -; CHECK-NEXT: vs1r.v v9, (a2) +; CHECK-NEXT: vs1r.v v8, (a2) +; CHECK-NEXT: vs1r.v v9, (a0) ; CHECK-NEXT: vl1re64.v v8, (a2) ; CHECK-NEXT: vl1re64.v v9, (a0) ; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp index 7426884217a08e3..1f2b8c1754f6ef5 100644 --- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp +++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp @@ -110,12 +110,12 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObject) { SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc); SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index, PtrInfo.getWithOffset(Offset)); - std::optional NumBytes = MemoryLocation::getSizeOrUnknown( - cast(Store)->getMemoryVT().getStoreSize()); + TypeSize NumBytes = cast(Store)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias); + Store.getNode(), LocationSize::precise(NumBytes), Store.getNode(), + LocationSize::precise(NumBytes), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_TRUE(IsAlias); @@ -134,14 +134,10 @@ TEST_F(SelectionDAGAddressAnalysisTest, sameFrameObjectUnknownSize) { SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index, PtrInfo.getWithOffset(Offset)); - // Maybe unlikely that BaseIndexOffset::computeAliasing is used with the - // optional NumBytes being unset like in this test, but it would be confusing - // if that function determined IsAlias=false here. 
- std::optional NumBytes; - bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store.getNode(), NumBytes, Store.getNode(), NumBytes, *DAG, IsAlias); + Store.getNode(), LocationSize::beforeOrAfterPointer(), Store.getNode(), + LocationSize::beforeOrAfterPointer(), *DAG, IsAlias); EXPECT_FALSE(IsValid); } @@ -165,14 +161,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, noAliasingFrameObjects) { PtrInfo.getWithOffset(Offset0)); SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1, PtrInfo.getWithOffset(Offset1)); - std::optional NumBytes0 = MemoryLocation::getSizeOrUnknown( - cast(Store0)->getMemoryVT().getStoreSize()); - std::optional NumBytes1 = MemoryLocation::getSizeOrUnknown( - cast(Store1)->getMemoryVT().getStoreSize()); + TypeSize NumBytes0 = cast(Store0)->getMemoryVT().getStoreSize(); + TypeSize NumBytes1 = cast(Store1)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias); + Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(), + LocationSize::precise(NumBytes1), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_FALSE(IsAlias); @@ -195,14 +190,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, unknownSizeFrameObjects) { DAG->getStore(DAG->getEntryNode(), Loc, Value, FIPtr, PtrInfo); SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1, MachinePointerInfo(PtrInfo.getAddrSpace())); - std::optional NumBytes0 = MemoryLocation::getSizeOrUnknown( - cast(Store0)->getMemoryVT().getStoreSize()); - std::optional NumBytes1 = MemoryLocation::getSizeOrUnknown( - cast(Store1)->getMemoryVT().getStoreSize()); + TypeSize NumBytes0 = cast(Store0)->getMemoryVT().getStoreSize(); + TypeSize NumBytes1 = cast(Store1)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias); + Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(), + LocationSize::precise(NumBytes1), *DAG, IsAlias); EXPECT_FALSE(IsValid); } @@ -220,20 +214,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithFrameObject) { SDValue Index = DAG->getMemBasePlusOffset(FIPtr, Offset, Loc); SDValue Store = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index, PtrInfo.getWithOffset(Offset)); - std::optional NumBytes = MemoryLocation::getSizeOrUnknown( - cast(Store)->getMemoryVT().getStoreSize()); + TypeSize NumBytes = cast(Store)->getMemoryVT().getStoreSize(); EVT GTy = DAG->getTargetLoweringInfo().getValueType(DAG->getDataLayout(), G->getType()); SDValue GValue = DAG->getConstant(0, Loc, GTy); SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy); SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr, MachinePointerInfo(G, 0)); - std::optional GNumBytes = MemoryLocation::getSizeOrUnknown( - cast(GStore)->getMemoryVT().getStoreSize()); + TypeSize GNumBytes = cast(GStore)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store.getNode(), NumBytes, GStore.getNode(), GNumBytes, *DAG, IsAlias); + Store.getNode(), LocationSize::precise(NumBytes), GStore.getNode(), + LocationSize::precise(GNumBytes), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_FALSE(IsAlias); @@ -248,8 +241,7 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) { SDValue GAddr = DAG->getGlobalAddress(G, Loc, GTy); SDValue GStore = DAG->getStore(DAG->getEntryNode(), Loc, GValue, GAddr, MachinePointerInfo(G, 0)); - 
std::optional GNumBytes = MemoryLocation::getSizeOrUnknown( - cast(GStore)->getMemoryVT().getStoreSize()); + TypeSize GNumBytes = cast(GStore)->getMemoryVT().getStoreSize(); SDValue AliasedGValue = DAG->getConstant(1, Loc, GTy); SDValue AliasedGAddr = DAG->getGlobalAddress(AliasedG, Loc, GTy); @@ -258,9 +250,9 @@ TEST_F(SelectionDAGAddressAnalysisTest, globalWithAliasedGlobal) { MachinePointerInfo(AliasedG, 0)); bool IsAlias; - bool IsValid = BaseIndexOffset::computeAliasing(GStore.getNode(), GNumBytes, - AliasedGStore.getNode(), - GNumBytes, *DAG, IsAlias); + bool IsValid = BaseIndexOffset::computeAliasing( + GStore.getNode(), LocationSize::precise(GNumBytes), + AliasedGStore.getNode(), LocationSize::precise(GNumBytes), *DAG, IsAlias); // With some deeper analysis we could detect if G and AliasedG is aliasing or // not. But computeAliasing is currently defensive and assumes that a @@ -290,19 +282,19 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsWithinDiff) { PtrInfo.getWithOffset(Offset0)); SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1, PtrInfo.getWithOffset(Offset1)); - std::optional NumBytes0 = MemoryLocation::getSizeOrUnknown( - cast(Store0)->getMemoryVT().getStoreSize()); - std::optional NumBytes1 = MemoryLocation::getSizeOrUnknown( - cast(Store1)->getMemoryVT().getStoreSize()); + TypeSize NumBytes0 = cast(Store0)->getMemoryVT().getStoreSize(); + TypeSize NumBytes1 = cast(Store1)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias); + Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(), + LocationSize::precise(NumBytes1), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_FALSE(IsAlias); IsValid = BaseIndexOffset::computeAliasing( - Store1.getNode(), NumBytes1, Store0.getNode(), NumBytes0, *DAG, IsAlias); + Store1.getNode(), LocationSize::precise(NumBytes1), Store0.getNode(), + LocationSize::precise(NumBytes0), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_FALSE(IsAlias); } @@ -331,14 +323,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, fixedSizeFrameObjectsOutOfDiff) { PtrInfo.getWithOffset(Offset0)); SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1, PtrInfo.getWithOffset(Offset1)); - std::optional NumBytes0 = MemoryLocation::getSizeOrUnknown( - cast(Store0)->getMemoryVT().getStoreSize()); - std::optional NumBytes1 = MemoryLocation::getSizeOrUnknown( - cast(Store1)->getMemoryVT().getStoreSize()); + TypeSize NumBytes0 = cast(Store0)->getMemoryVT().getStoreSize(); + TypeSize NumBytes1 = cast(Store1)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias); + Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(), + LocationSize::precise(NumBytes1), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_TRUE(IsAlias); } @@ -365,14 +356,13 @@ TEST_F(SelectionDAGAddressAnalysisTest, twoFixedStackObjects) { PtrInfo0.getWithOffset(Offset0)); SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value1, Index1, PtrInfo1.getWithOffset(Offset0)); - std::optional NumBytes0 = MemoryLocation::getSizeOrUnknown( - cast(Store0)->getMemoryVT().getStoreSize()); - std::optional NumBytes1 = MemoryLocation::getSizeOrUnknown( - cast(Store1)->getMemoryVT().getStoreSize()); + TypeSize NumBytes0 = cast(Store0)->getMemoryVT().getStoreSize(); + TypeSize NumBytes1 = 
cast(Store1)->getMemoryVT().getStoreSize(); bool IsAlias; bool IsValid = BaseIndexOffset::computeAliasing( - Store0.getNode(), NumBytes0, Store1.getNode(), NumBytes1, *DAG, IsAlias); + Store0.getNode(), LocationSize::precise(NumBytes0), Store1.getNode(), + LocationSize::precise(NumBytes1), *DAG, IsAlias); EXPECT_TRUE(IsValid); EXPECT_FALSE(IsAlias); } From ba692301f1697183d1665cc0f410d4235b3036db Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Wed, 28 Feb 2024 09:52:08 +0000 Subject: [PATCH 049/114] [AArch64][GlobalISel] Pre-Commit Test for G_STORE v4s8 (#82498) --- llvm/test/CodeGen/AArch64/store.ll | 337 +++++++++++++++++++++++++++++ 1 file changed, 337 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/store.ll diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll new file mode 100644 index 000000000000000..251e5382ef396fc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -0,0 +1,337 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for store_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for store_v4i8 + +; ===== Legal Scalars ===== +define void @store_i8(i8 %a, ptr %ptr){ +; CHECK-LABEL: store_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: strb w0, [x1] +; CHECK-NEXT: ret + store i8 %a, ptr %ptr + ret void +} + +define void @store_i16(i16 %a, ptr %ptr){ +; CHECK-LABEL: store_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: strh w0, [x1] +; CHECK-NEXT: ret + store i16 %a, ptr %ptr + ret void +} + +define void @store_i32(i32 %a, ptr %ptr){ +; CHECK-LABEL: store_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str w0, [x1] +; CHECK-NEXT: ret + store i32 %a, ptr %ptr + ret void +} + +define void @store_i64(i64 %a, ptr %ptr){ +; CHECK-LABEL: store_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x0, [x1] +; CHECK-NEXT: ret + store i64 %a, ptr %ptr + ret void +} + +; ===== Legal Vector Types ===== + +define void @store_v8i8(<8 x i8> %a, ptr %ptr){ +; CHECK-LABEL: store_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + store <8 x i8> %a, ptr %ptr + ret void +} + +define void @store_v16i8(<16 x i8> %a, ptr %ptr){ +; CHECK-LABEL: store_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + store <16 x i8> %a, ptr %ptr + ret void +} + +define void @store_v4i16(<4 x i16> %a, ptr %ptr){ +; CHECK-LABEL: store_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + store <4 x i16> %a, ptr %ptr + ret void +} + +define void @store_v8i16(<8 x i16> %a, ptr %ptr){ +; CHECK-LABEL: store_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + store <8 x i16> %a, ptr %ptr + ret void +} + +define void @store_v2i32(<2 x i32> %a, ptr %ptr){ +; CHECK-LABEL: store_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + store <2 x i32> %a, ptr %ptr + ret void +} + +define void @store_v4i32(<4 x i32> %a, ptr %ptr){ +; CHECK-LABEL: store_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + store <4 x i32> %a, ptr %ptr + ret void +} + +define void @store_v2i64(<2 x i64> %a, ptr %ptr){ +; CHECK-LABEL: store_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + store <2 x i64> %a, ptr %ptr + ret void +} 
+ +; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== + +define void @store_v2i8(<2 x i8> %a, ptr %ptr){ +; CHECK-LABEL: store_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: strb w9, [x0] +; CHECK-NEXT: strb w8, [x0, #1] +; CHECK-NEXT: ret + store <2 x i8> %a, ptr %ptr + ret void +} + +define void @store_v4i8(i32 %a, ptr %ptr) { +; CHECK-LABEL: store_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str w0, [x1] +; CHECK-NEXT: ret + %c = bitcast i32 %a to <4 x i8> + store <4 x i8> %c, ptr %ptr + ret void +} + +define void @store_v32i8(<32 x i8> %a, ptr %ptr){ +; CHECK-LABEL: store_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + store <32 x i8> %a, ptr %ptr + ret void +} + +define void @store_v2i16(<2 x i16> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v2i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strh w9, [x0] +; CHECK-SD-NEXT: strh w8, [x0, #2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v2i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: str h1, [x0, #2] +; CHECK-GI-NEXT: ret + store <2 x i16> %a, ptr %ptr + ret void +} + +define void @store_v16i16(<16 x i16> %a, ptr %ptr){ +; CHECK-LABEL: store_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + store <16 x i16> %a, ptr %ptr + ret void +} + +define void @store_v1i32(<1 x i32> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str s0, [x0] +; CHECK-GI-NEXT: ret + store <1 x i32> %a, ptr %ptr + ret void +} + +define void @store_v8i32(<8 x i32> %a, ptr %ptr){ +; CHECK-LABEL: store_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + store <8 x i32> %a, ptr %ptr + ret void +} + +define void @store_v4i64(<4 x i64> %a, ptr %ptr){ +; CHECK-LABEL: store_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: stp q0, q1, [x0] +; CHECK-NEXT: ret + store <4 x i64> %a, ptr %ptr + ret void +} + +; ===== Vectors with Non-Pow 2 Widths ===== + +define void @store_v3i8(<3 x i8> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sub sp, sp, #16 +; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.h[1], w1 +; CHECK-SD-NEXT: mov v0.h[2], w2 +; CHECK-SD-NEXT: xtn v0.8b, v0.8h +; CHECK-SD-NEXT: str s0, [sp, #12] +; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: strb w2, [x3, #2] +; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: add sp, sp, #16 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: strb w0, [x3] +; CHECK-GI-NEXT: strb w1, [x3, #1] +; CHECK-GI-NEXT: strb w2, [x3, #2] +; CHECK-GI-NEXT: ret + store <3 x i8> %a, ptr %ptr + ret void +} + +define void @store_v7i8(<7 x i8> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v7i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #6 +; CHECK-SD-NEXT: add x9, x0, #4 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: st1 { v0.b }[6], [x8] +; CHECK-SD-NEXT: st1 { v0.h }[2], [x9] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: 
store_v7i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: add x9, x0, #2 +; CHECK-GI-NEXT: st1 { v0.b }[0], [x0] +; CHECK-GI-NEXT: st1 { v0.b }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #3 +; CHECK-GI-NEXT: st1 { v0.b }[3], [x8] +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: st1 { v0.b }[4], [x8] +; CHECK-GI-NEXT: add x8, x0, #5 +; CHECK-GI-NEXT: st1 { v0.b }[5], [x8] +; CHECK-GI-NEXT: add x8, x0, #6 +; CHECK-GI-NEXT: st1 { v0.b }[2], [x9] +; CHECK-GI-NEXT: st1 { v0.b }[6], [x8] +; CHECK-GI-NEXT: ret + store <7 x i8> %a, ptr %ptr + ret void +} + +define void @store_v3i16(<3 x i16> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v3i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #4 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: st1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v3i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: ret + store <3 x i16> %a, ptr %ptr + ret void +} + +define void @store_v7i16(<7 x i16> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v7i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #12 +; CHECK-SD-NEXT: add x9, x0, #8 +; CHECK-SD-NEXT: str d0, [x0] +; CHECK-SD-NEXT: st1 { v0.h }[6], [x8] +; CHECK-SD-NEXT: st1 { v0.s }[2], [x9] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v7i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #2 +; CHECK-GI-NEXT: add x9, x0, #4 +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: st1 { v0.h }[1], [x8] +; CHECK-GI-NEXT: add x8, x0, #6 +; CHECK-GI-NEXT: st1 { v0.h }[3], [x8] +; CHECK-GI-NEXT: add x8, x0, #8 +; CHECK-GI-NEXT: st1 { v0.h }[4], [x8] +; CHECK-GI-NEXT: add x8, x0, #10 +; CHECK-GI-NEXT: st1 { v0.h }[5], [x8] +; CHECK-GI-NEXT: add x8, x0, #12 +; CHECK-GI-NEXT: st1 { v0.h }[2], [x9] +; CHECK-GI-NEXT: st1 { v0.h }[6], [x8] +; CHECK-GI-NEXT: ret + store <7 x i16> %a, ptr %ptr + ret void +} + +define void @store_v3i32(<3 x i32> %a, ptr %ptr){ +; CHECK-SD-LABEL: store_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: add x8, x0, #8 +; CHECK-SD-NEXT: str d0, [x0] +; CHECK-SD-NEXT: st1 { v0.s }[2], [x8] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: add x8, x0, #4 +; CHECK-GI-NEXT: add x9, x0, #8 +; CHECK-GI-NEXT: str s0, [x0] +; CHECK-GI-NEXT: st1 { v0.s }[1], [x8] +; CHECK-GI-NEXT: st1 { v0.s }[2], [x9] +; CHECK-GI-NEXT: ret + store <3 x i32> %a, ptr %ptr + ret void +} From 3fa91021257ec89ccbfa8aae80312700c2de9d11 Mon Sep 17 00:00:00 2001 From: jkorous-apple Date: Wed, 28 Feb 2024 02:05:20 -0800 Subject: [PATCH 050/114] [-Wunsafe-buffer-usage][NFC] clang-format UnsafeBufferUsage.cpp (#82027) --- clang/lib/Analysis/UnsafeBufferUsage.cpp | 259 +++++++++++------------ 1 file changed, 129 insertions(+), 130 deletions(-) diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 701f1ac852c256a..e1ff0d92f6b2f87 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -130,42 +130,42 @@ class MatchDescendantVisitor bool TraverseGenericSelectionExpr(GenericSelectionExpr *Node) { // These are unevaluated, except the result expression. 
- if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return TraverseStmt(Node->getResultExpr()); return VisitorBase::TraverseGenericSelectionExpr(Node); } bool TraverseUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *Node) { // Unevaluated context. - if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return true; return VisitorBase::TraverseUnaryExprOrTypeTraitExpr(Node); } bool TraverseTypeOfExprTypeLoc(TypeOfExprTypeLoc Node) { // Unevaluated context. - if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return true; return VisitorBase::TraverseTypeOfExprTypeLoc(Node); } bool TraverseDecltypeTypeLoc(DecltypeTypeLoc Node) { // Unevaluated context. - if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return true; return VisitorBase::TraverseDecltypeTypeLoc(Node); } bool TraverseCXXNoexceptExpr(CXXNoexceptExpr *Node) { // Unevaluated context. - if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return true; return VisitorBase::TraverseCXXNoexceptExpr(Node); } bool TraverseCXXTypeidExpr(CXXTypeidExpr *Node) { // Unevaluated context. - if(ignoreUnevaluatedContext) + if (ignoreUnevaluatedContext) return true; return VisitorBase::TraverseCXXTypeidExpr(Node); } @@ -213,24 +213,26 @@ class MatchDescendantVisitor // Because we're dealing with raw pointers, let's define what we mean by that. static auto hasPointerType() { - return hasType(hasCanonicalType(pointerType())); + return hasType(hasCanonicalType(pointerType())); } -static auto hasArrayType() { - return hasType(hasCanonicalType(arrayType())); -} +static auto hasArrayType() { return hasType(hasCanonicalType(arrayType())); } -AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher, innerMatcher) { +AST_MATCHER_P(Stmt, forEachDescendantEvaluatedStmt, internal::Matcher, + innerMatcher) { const DynTypedMatcher &DTM = static_cast(innerMatcher); - MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, true); + MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, + true); return Visitor.findMatch(DynTypedNode::create(Node)); } -AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher, innerMatcher) { +AST_MATCHER_P(Stmt, forEachDescendantStmt, internal::Matcher, + innerMatcher) { const DynTypedMatcher &DTM = static_cast(innerMatcher); - MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, false); + MatchDescendantVisitor Visitor(&DTM, Finder, Builder, ASTMatchFinder::BK_All, + false); return Visitor.findMatch(DynTypedNode::create(Node)); } @@ -268,10 +270,9 @@ static auto isInUnspecifiedLvalueContext(internal::Matcher innerMatcher) { hasLHS(innerMatcher) ) )); -// clang-format on + // clang-format on } - // Returns a matcher that matches any expression `e` such that `InnerMatcher` // matches `e` and `e` is in an Unspecified Pointer Context (UPC). static internal::Matcher @@ -315,7 +316,7 @@ isInUnspecifiedPointerContext(internal::Matcher InnerMatcher) { // clang-format on return stmt(anyOf(CallArgMatcher, CastOperandMatcher, CompOperandMatcher, - PtrSubtractionMatcher)); + PtrSubtractionMatcher)); // FIXME: any more cases? (UPC excludes the RHS of an assignment. For now we // don't have to check that.) 
} @@ -481,7 +482,9 @@ class Gadget { #ifndef NDEBUG StringRef getDebugName() const { switch (K) { -#define GADGET(x) case Kind::x: return #x; +#define GADGET(x) \ + case Kind::x: \ + return #x; #include "clang/Analysis/Analyses/UnsafeBufferUsageGadgets.def" } llvm_unreachable("Unhandled Gadget::Kind enum"); @@ -502,7 +505,6 @@ class Gadget { Kind K; }; - /// Warning gadgets correspond to unsafe code patterns that warrants /// an immediate warning. class WarningGadget : public Gadget { @@ -513,10 +515,10 @@ class WarningGadget : public Gadget { bool isWarningGadget() const final { return true; } }; -/// Fixable gadgets correspond to code patterns that aren't always unsafe but need to be -/// properly recognized in order to emit fixes. For example, if a raw pointer-type -/// variable is replaced by a safe C++ container, every use of such variable must be -/// carefully considered and possibly updated. +/// Fixable gadgets correspond to code patterns that aren't always unsafe but +/// need to be properly recognized in order to emit fixes. For example, if a raw +/// pointer-type variable is replaced by a safe C++ container, every use of such +/// variable must be carefully considered and possibly updated. class FixableGadget : public Gadget { public: FixableGadget(Kind K) : Gadget(K) {} @@ -531,20 +533,19 @@ class FixableGadget : public Gadget { return std::nullopt; } - /// Returns a list of two elements where the first element is the LHS of a pointer assignment - /// statement and the second element is the RHS. This two-element list represents the fact that - /// the LHS buffer gets its bounds information from the RHS buffer. This information will be used - /// later to group all those variables whose types must be modified together to prevent type - /// mismatches. + /// Returns a list of two elements where the first element is the LHS of a + /// pointer assignment statement and the second element is the RHS. This + /// two-element list represents the fact that the LHS buffer gets its bounds + /// information from the RHS buffer. This information will be used later to + /// group all those variables whose types must be modified together to prevent + /// type mismatches. 
virtual std::optional> getStrategyImplications() const { return std::nullopt; } }; -static auto toSupportedVariable() { - return to(varDecl()); -} +static auto toSupportedVariable() { return to(varDecl()); } using FixableGadgetList = std::vector>; using WarningGadgetList = std::vector>; @@ -565,10 +566,10 @@ class IncrementGadget : public WarningGadget { } static Matcher matcher() { - return stmt(unaryOperator( - hasOperatorName("++"), - hasUnaryOperand(ignoringParenImpCasts(hasPointerType())) - ).bind(OpTag)); + return stmt( + unaryOperator(hasOperatorName("++"), + hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))) + .bind(OpTag)); } const UnaryOperator *getBaseStmt() const override { return Op; } @@ -600,10 +601,10 @@ class DecrementGadget : public WarningGadget { } static Matcher matcher() { - return stmt(unaryOperator( - hasOperatorName("--"), - hasUnaryOperand(ignoringParenImpCasts(hasPointerType())) - ).bind(OpTag)); + return stmt( + unaryOperator(hasOperatorName("--"), + hasUnaryOperand(ignoringParenImpCasts(hasPointerType()))) + .bind(OpTag)); } const UnaryOperator *getBaseStmt() const override { return Op; } @@ -754,26 +755,25 @@ class PointerInitGadget : public FixableGadget { private: static constexpr const char *const PointerInitLHSTag = "ptrInitLHS"; static constexpr const char *const PointerInitRHSTag = "ptrInitRHS"; - const VarDecl * PtrInitLHS; // the LHS pointer expression in `PI` - const DeclRefExpr * PtrInitRHS; // the RHS pointer expression in `PI` + const VarDecl *PtrInitLHS; // the LHS pointer expression in `PI` + const DeclRefExpr *PtrInitRHS; // the RHS pointer expression in `PI` public: PointerInitGadget(const MatchFinder::MatchResult &Result) : FixableGadget(Kind::PointerInit), - PtrInitLHS(Result.Nodes.getNodeAs(PointerInitLHSTag)), - PtrInitRHS(Result.Nodes.getNodeAs(PointerInitRHSTag)) {} + PtrInitLHS(Result.Nodes.getNodeAs(PointerInitLHSTag)), + PtrInitRHS(Result.Nodes.getNodeAs(PointerInitRHSTag)) {} static bool classof(const Gadget *G) { return G->getKind() == Kind::PointerInit; } static Matcher matcher() { - auto PtrInitStmt = declStmt(hasSingleDecl(varDecl( - hasInitializer(ignoringImpCasts(declRefExpr( - hasPointerType(), - toSupportedVariable()). - bind(PointerInitRHSTag)))). 
- bind(PointerInitLHSTag))); + auto PtrInitStmt = declStmt(hasSingleDecl( + varDecl(hasInitializer(ignoringImpCasts( + declRefExpr(hasPointerType(), toSupportedVariable()) + .bind(PointerInitRHSTag)))) + .bind(PointerInitLHSTag))); return stmt(PtrInitStmt); } @@ -793,8 +793,7 @@ class PointerInitGadget : public FixableGadget { virtual std::optional> getStrategyImplications() const override { - return std::make_pair(PtrInitLHS, - cast(PtrInitRHS->getDecl())); + return std::make_pair(PtrInitLHS, cast(PtrInitRHS->getDecl())); } }; @@ -807,8 +806,8 @@ class PtrToPtrAssignmentGadget : public FixableGadget { private: static constexpr const char *const PointerAssignLHSTag = "ptrLHS"; static constexpr const char *const PointerAssignRHSTag = "ptrRHS"; - const DeclRefExpr * PtrLHS; // the LHS pointer expression in `PA` - const DeclRefExpr * PtrRHS; // the RHS pointer expression in `PA` + const DeclRefExpr *PtrLHS; // the LHS pointer expression in `PA` + const DeclRefExpr *PtrRHS; // the RHS pointer expression in `PA` public: PtrToPtrAssignmentGadget(const MatchFinder::MatchResult &Result) @@ -821,13 +820,13 @@ class PtrToPtrAssignmentGadget : public FixableGadget { } static Matcher matcher() { - auto PtrAssignExpr = binaryOperator(allOf(hasOperatorName("="), - hasRHS(ignoringParenImpCasts(declRefExpr(hasPointerType(), - toSupportedVariable()). - bind(PointerAssignRHSTag))), - hasLHS(declRefExpr(hasPointerType(), - toSupportedVariable()). - bind(PointerAssignLHSTag)))); + auto PtrAssignExpr = binaryOperator( + allOf(hasOperatorName("="), + hasRHS(ignoringParenImpCasts( + declRefExpr(hasPointerType(), toSupportedVariable()) + .bind(PointerAssignRHSTag))), + hasLHS(declRefExpr(hasPointerType(), toSupportedVariable()) + .bind(PointerAssignLHSTag)))); return stmt(isInUnspecifiedUntypedContext(PtrAssignExpr)); } @@ -981,9 +980,8 @@ class ULCArraySubscriptGadget : public FixableGadget { static Matcher matcher() { auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType()); - auto BaseIsArrayOrPtrDRE = - hasBase(ignoringParenImpCasts(declRefExpr(ArrayOrPtr, - toSupportedVariable()))); + auto BaseIsArrayOrPtrDRE = hasBase( + ignoringParenImpCasts(declRefExpr(ArrayOrPtr, toSupportedVariable()))); auto Target = arraySubscriptExpr(BaseIsArrayOrPtrDRE).bind(ULCArraySubscriptTag); @@ -1025,9 +1023,9 @@ class UPCStandalonePointerGadget : public FixableGadget { static Matcher matcher() { auto ArrayOrPtr = anyOf(hasPointerType(), hasArrayType()); - auto target = expr( - ignoringParenImpCasts(declRefExpr(allOf(ArrayOrPtr, - toSupportedVariable())).bind(DeclRefExprTag))); + auto target = expr(ignoringParenImpCasts( + declRefExpr(allOf(ArrayOrPtr, toSupportedVariable())) + .bind(DeclRefExprTag))); return stmt(isInUnspecifiedPointerContext(target)); } @@ -1036,9 +1034,7 @@ class UPCStandalonePointerGadget : public FixableGadget { virtual const Stmt *getBaseStmt() const override { return Node; } - virtual DeclUseList getClaimedVarUseSites() const override { - return {Node}; - } + virtual DeclUseList getClaimedVarUseSites() const override { return {Node}; } }; class PointerDereferenceGadget : public FixableGadget { @@ -1103,10 +1099,10 @@ class UPCAddressofArraySubscriptGadget : public FixableGadget { static Matcher matcher() { return expr(isInUnspecifiedPointerContext(expr(ignoringImpCasts( - unaryOperator(hasOperatorName("&"), - hasUnaryOperand(arraySubscriptExpr( - hasBase(ignoringParenImpCasts(declRefExpr( - toSupportedVariable())))))) + unaryOperator( + hasOperatorName("&"), + 
hasUnaryOperand(arraySubscriptExpr(hasBase( + ignoringParenImpCasts(declRefExpr(toSupportedVariable())))))) .bind(UPCAddressofArraySubscriptTag))))); } @@ -1195,13 +1191,13 @@ class DeclUseTracker { class UPCPreIncrementGadget : public FixableGadget { private: static constexpr const char *const UPCPreIncrementTag = - "PointerPreIncrementUnderUPC"; + "PointerPreIncrementUnderUPC"; const UnaryOperator *Node; // the `++Ptr` node public: UPCPreIncrementGadget(const MatchFinder::MatchResult &Result) - : FixableGadget(Kind::UPCPreIncrement), - Node(Result.Nodes.getNodeAs(UPCPreIncrementTag)) { + : FixableGadget(Kind::UPCPreIncrement), + Node(Result.Nodes.getNodeAs(UPCPreIncrementTag)) { assert(Node != nullptr && "Expecting a non-null matching result"); } @@ -1215,10 +1211,9 @@ class UPCPreIncrementGadget : public FixableGadget { // can have the matcher be general, so long as `getClaimedVarUseSites` does // things right. return stmt(isInUnspecifiedPointerContext(expr(ignoringImpCasts( - unaryOperator(isPreInc(), - hasUnaryOperand(declRefExpr( - toSupportedVariable())) - ).bind(UPCPreIncrementTag))))); + unaryOperator(isPreInc(), + hasUnaryOperand(declRefExpr(toSupportedVariable()))) + .bind(UPCPreIncrementTag))))); } virtual std::optional @@ -1782,9 +1777,9 @@ static SourceRange getSourceRangeToTokenEnd(const Decl *D, const LangOptions &LangOpts) { SourceLocation Begin = D->getBeginLoc(); SourceLocation - End = // `D->getEndLoc` should always return the starting location of the - // last token, so we should get the end of the token - Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts); + End = // `D->getEndLoc` should always return the starting location of the + // last token, so we should get the end of the token + Lexer::getLocForEndOfToken(D->getEndLoc(), 0, SM, LangOpts); return SourceRange(Begin, End); } @@ -1976,7 +1971,7 @@ PointerDereferenceGadget::getFixits(const FixitStrategy &S) const { if (auto LocPastOperand = getPastLoc(BaseDeclRefExpr, SM, Ctx.getLangOpts())) { return FixItList{{FixItHint::CreateRemoval(derefRange), - FixItHint::CreateInsertion(*LocPastOperand, "[0]")}}; + FixItHint::CreateInsertion(*LocPastOperand, "[0]")}}; } break; } @@ -2162,7 +2157,8 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx, // NULL pointer, we use the default constructor to initialize the span // object, i.e., a `std:span` variable declaration with no initializer. // So the fix-it is just to remove the initializer. - if (Init->isNullPointerConstant(Ctx, + if (Init->isNullPointerConstant( + Ctx, // FIXME: Why does this function not ask for `const ASTContext // &`? It should. Maybe worth an NFC patch later. Expr::NullPointerConstantValueDependence:: @@ -2230,8 +2226,10 @@ FixVarInitializerWithSpan(const Expr *Init, ASTContext &Ctx, } #ifndef NDEBUG -#define DEBUG_NOTE_DECL_FAIL(D, Msg) \ -Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for declaration '" + (D)->getNameAsString() + "'" + (Msg)) +#define DEBUG_NOTE_DECL_FAIL(D, Msg) \ + Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), \ + "failed to produce fixit for declaration '" + \ + (D)->getNameAsString() + "'" + (Msg)) #else #define DEBUG_NOTE_DECL_FAIL(D, Msg) #endif @@ -2239,8 +2237,8 @@ Handler.addDebugNoteForVar((D), (D)->getBeginLoc(), "failed to produce fixit for // For the given variable declaration with a pointer-to-T type, returns the text // `std::span`. If it is unable to generate the text, returns // `std::nullopt`. 
-static std::optional createSpanTypeForVarDecl(const VarDecl *VD, - const ASTContext &Ctx) { +static std::optional +createSpanTypeForVarDecl(const VarDecl *VD, const ASTContext &Ctx) { assert(VD->getType()->isPointerType()); std::optional PteTyQualifiers = std::nullopt; @@ -2277,8 +2275,8 @@ static std::optional createSpanTypeForVarDecl(const VarDecl *VD, // the non-empty fix-it list, if fix-its are successfuly generated; empty // list otherwise. static FixItList fixLocalVarDeclWithSpan(const VarDecl *D, ASTContext &Ctx, - const StringRef UserFillPlaceHolder, - UnsafeBufferUsageHandler &Handler) { + const StringRef UserFillPlaceHolder, + UnsafeBufferUsageHandler &Handler) { if (hasUnsupportedSpecifiers(D, Ctx.getSourceManager())) return {}; @@ -2431,9 +2429,9 @@ createOverloadsForFixedParams(const FixitStrategy &S, const FunctionDecl *FD, // print parameter name if provided: if (IdentifierInfo *II = Parm->getIdentifier()) SS << ' ' << II->getName().str(); - } else if (auto ParmTypeText = getRangeText( - getSourceRangeToTokenEnd(Parm, SM, LangOpts), - SM, LangOpts)) { + } else if (auto ParmTypeText = + getRangeText(getSourceRangeToTokenEnd(Parm, SM, LangOpts), + SM, LangOpts)) { // print the whole `Parm` without modification: SS << ParmTypeText->str(); } else @@ -2577,7 +2575,8 @@ static FixItList fixVariableWithSpan(const VarDecl *VD, UnsafeBufferUsageHandler &Handler) { const DeclStmt *DS = Tracker.lookupDecl(VD); if (!DS) { - DEBUG_NOTE_DECL_FAIL(VD, " : variables declared this way not implemented yet"); + DEBUG_NOTE_DECL_FAIL(VD, + " : variables declared this way not implemented yet"); return {}; } if (!DS->isSingleDecl()) { @@ -2979,8 +2978,8 @@ void clang::checkUnsafeBufferUsage(const Decl *D, #endif assert(D && D->getBody()); - // We do not want to visit a Lambda expression defined inside a method independently. - // Instead, it should be visited along with the outer method. + // We do not want to visit a Lambda expression defined inside a method + // independently. Instead, it should be visited along with the outer method. // FIXME: do we want to do the same thing for `BlockDecl`s? if (const auto *fd = dyn_cast(D)) { if (fd->getParent()->isLambda() && fd->getParent()->isLocalClass()) @@ -2990,7 +2989,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D, // Do not emit fixit suggestions for functions declared in an // extern "C" block. if (const auto *FD = dyn_cast(D)) { - for (FunctionDecl *FReDecl : FD->redecls()) { + for (FunctionDecl *FReDecl : FD->redecls()) { if (FReDecl->isExternC()) { EmitSuggestions = false; break; @@ -3002,7 +3001,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D, FixableGadgetSets FixablesForAllVars; auto [FixableGadgets, WarningGadgets, Tracker] = - findGadgets(D, Handler, EmitSuggestions); + findGadgets(D, Handler, EmitSuggestions); if (!EmitSuggestions) { // Our job is very easy without suggestions. Just warn about @@ -3055,36 +3054,36 @@ void clang::checkUnsafeBufferUsage(const Decl *D, // Filter out non-local vars and vars with unclaimed DeclRefExpr-s. 
for (auto it = FixablesForAllVars.byVar.cbegin(); it != FixablesForAllVars.byVar.cend();) { - // FIXME: need to deal with global variables later - if ((!it->first->isLocalVarDecl() && !isa(it->first))) { + // FIXME: need to deal with global variables later + if ((!it->first->isLocalVarDecl() && !isa(it->first))) { #ifndef NDEBUG - Handler.addDebugNoteForVar( - it->first, it->first->getBeginLoc(), - ("failed to produce fixit for '" + it->first->getNameAsString() + - "' : neither local nor a parameter")); + Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(), + ("failed to produce fixit for '" + + it->first->getNameAsString() + + "' : neither local nor a parameter")); #endif - it = FixablesForAllVars.byVar.erase(it); - } else if (it->first->getType().getCanonicalType()->isReferenceType()) { + it = FixablesForAllVars.byVar.erase(it); + } else if (it->first->getType().getCanonicalType()->isReferenceType()) { #ifndef NDEBUG - Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(), - ("failed to produce fixit for '" + - it->first->getNameAsString() + - "' : has a reference type")); + Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(), + ("failed to produce fixit for '" + + it->first->getNameAsString() + + "' : has a reference type")); #endif - it = FixablesForAllVars.byVar.erase(it); - } else if (Tracker.hasUnclaimedUses(it->first)) { - it = FixablesForAllVars.byVar.erase(it); - } else if (it->first->isInitCapture()) { + it = FixablesForAllVars.byVar.erase(it); + } else if (Tracker.hasUnclaimedUses(it->first)) { + it = FixablesForAllVars.byVar.erase(it); + } else if (it->first->isInitCapture()) { #ifndef NDEBUG - Handler.addDebugNoteForVar( - it->first, it->first->getBeginLoc(), - ("failed to produce fixit for '" + it->first->getNameAsString() + - "' : init capture")); + Handler.addDebugNoteForVar(it->first, it->first->getBeginLoc(), + ("failed to produce fixit for '" + + it->first->getNameAsString() + + "' : init capture")); #endif - it = FixablesForAllVars.byVar.erase(it); - } else { - ++it; - } + it = FixablesForAllVars.byVar.erase(it); + } else { + ++it; + } } #ifndef NDEBUG @@ -3115,7 +3114,7 @@ void clang::checkUnsafeBufferUsage(const Decl *D, for (auto it : FixablesForAllVars.byVar) { for (const FixableGadget *fixable : it.second) { std::optional> ImplPair = - fixable->getStrategyImplications(); + fixable->getStrategyImplications(); if (ImplPair) { std::pair Impl = std::move(*ImplPair); PtrAssignmentGraph[Impl.first].insert(Impl.second); @@ -3144,10 +3143,10 @@ void clang::checkUnsafeBufferUsage(const Decl *D, for (const auto &[Var, ignore] : UnsafeOps.byVar) { if (VisitedVarsDirected.find(Var) == VisitedVarsDirected.end()) { - std::queue QueueDirected{}; + std::queue QueueDirected{}; QueueDirected.push(Var); - while(!QueueDirected.empty()) { - const VarDecl* CurrentVar = QueueDirected.front(); + while (!QueueDirected.empty()) { + const VarDecl *CurrentVar = QueueDirected.front(); QueueDirected.pop(); VisitedVarsDirected.insert(CurrentVar); auto AdjacentNodes = PtrAssignmentGraph[CurrentVar]; @@ -3178,11 +3177,11 @@ void clang::checkUnsafeBufferUsage(const Decl *D, for (const auto &[Var, ignore] : UnsafeOps.byVar) { if (VisitedVars.find(Var) == VisitedVars.end()) { VarGrpTy &VarGroup = Groups.emplace_back(); - std::queue Queue{}; + std::queue Queue{}; Queue.push(Var); - while(!Queue.empty()) { - const VarDecl* CurrentVar = Queue.front(); + while (!Queue.empty()) { + const VarDecl *CurrentVar = Queue.front(); Queue.pop(); VisitedVars.insert(CurrentVar); 
VarGroup.push_back(CurrentVar); From 5468f8841353cd56350a6ebe6898d2563e5c34b0 Mon Sep 17 00:00:00 2001 From: "Oleksandr \"Alex\" Zinenko" Date: Wed, 28 Feb 2024 11:06:53 +0100 Subject: [PATCH 051/114] [mlir] update remaining transform tests to main pass (#81279) Use the main transform interpreter pass instead of the test pass. The only tests that are not updated are specific to the operation of the test pass. --- .../Dialect/Transform/Transforms/Passes.td | 24 +- .../Transform/Transforms/InterpreterPass.cpp | 95 ++++- ...nterpreter-external-concurrent-source.mlir | 25 +- .../test-interpreter-external-source.mlir | 14 +- .../Transform/multi-arg-top-level-ops.mlir | 63 ++- .../Transform/multi-arg-top-level-params.mlir | 39 +- .../Transform/multi-arg-top-level-values.mlir | 32 +- .../Transform/test-interpreter-debug.mlir | 58 +-- .../test-interpreter-external-concurrent.mlir | 4 +- .../Transform/test-interpreter-external.mlir | 4 +- .../Dialect/Transform/test-interpreter.mlir | 1 - .../Transform/test-pass-application.mlir | 78 ++-- .../Transform/test-pattern-application.mlir | 384 ++++++++++-------- .../Dialect/Transform/test-pdl-extension.mlir | 68 ++-- .../Transform/transform-state-extension.mlir | 102 ++--- .../Vector/CPU/ArmSVE/test-contraction.mlir | 18 +- 16 files changed, 584 insertions(+), 425 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td index 1d6eb24156e334c..86a2b3c21faf0d5 100644 --- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td @@ -66,7 +66,25 @@ def InterpreterPass : Pass<"transform-interpreter"> { let description = [{ This pass runs the transform dialect interpreter and applies the named sequence transformation specified by the provided name (defaults to - `TransformDialect::kTransformEntryPointSymbolName` (i.e. `__transform_main`)). + `TransformDialect::kTransformEntryPointSymbolName`, + i.e. `__transform_main`). + + Additional options can be used to narrow down the pass applicability for + debugging purposes: + * `debugPayloadRootTag` makes the transform script apply to the payload + operation that has a `transform.target_tag` string attribute with the + given value, rather than to the anchor operation of the pass. + * `debugBindTrailingArgs` allows one to bind values to trailing arguments + of the transform entry point as follows: + * arguments of `TransformHandleTypeInterface` type can be bound to all + payload operations with the name provided as a simple string; + * arguments of `TransformValueHandleTypeInterface` type can be bound to + a flattened list of results of all operations with the name provided + as a string prefixed with `^`; + * arguments of `TransformParamTypeInterface` type can be bound to + integer constants provided as `;`-separated list prefixed with `#`. + * `entryPoint` specifies the name of the transform symbol to serve as the + entry point. 
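+
+    For example, a minimal sketch (with `input.mlir` as a hypothetical
+    payload file, mirroring the updated `multi-arg-top-level-params.mlir`
+    test) binds two integer lists to trailing `!transform.param<i64>`
+    arguments of the entry point:
+
+      mlir-opt input.mlir --pass-pipeline="builtin.module(\
+          transform-interpreter{debug-bind-trailing-args=#1;2;3,#42;45})"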
}]; let dependentDialects = ["::mlir::transform::TransformDialect"]; let options = [ @@ -83,7 +101,9 @@ def InterpreterPass : Pass<"transform-interpreter"> { "false", "Disable expensive checks in the interpreter for a faster run.">, Option<"entryPoint", "entry-point", "std::string", - /*default=*/[{TransformDialect::kTransformEntryPointSymbolName.str()}], + /*default=*/[{ + TransformDialect::kTransformEntryPointSymbolName.str() + }], "Entry point of the pass pipeline.">, ]; } diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp index 5073234a7e35e9f..7adf223f3440a59 100644 --- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp +++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp @@ -50,12 +50,79 @@ static Operation *findPayloadRoot(Operation *passRoot, StringRef tag) { return WalkResult::interrupt(); }); + if (!target) { + passRoot->emitError() + << "could not find the operation with transform.target_tag=\"" << tag + << "\" attribute"; + return nullptr; + } + return walkResult.wasInterrupted() ? nullptr : target; } namespace { class InterpreterPass : public transform::impl::InterpreterPassBase { + // Parses the pass arguments to bind trailing arguments of the entry point. + std::optional> + parseArguments(Operation *payloadRoot) { + MLIRContext *context = payloadRoot->getContext(); + + SmallVector, 2> trailingBindings; + trailingBindings.resize(debugBindTrailingArgs.size()); + + // Construct lists of op names to match. + SmallVector> debugBindNames; + debugBindNames.reserve(debugBindTrailingArgs.size()); + for (auto &&[position, nameString] : + llvm::enumerate(debugBindTrailingArgs)) { + StringRef name = nameString; + + // Parse the integer literals. + if (name.starts_with("#")) { + debugBindNames.push_back(std::nullopt); + StringRef lhs = ""; + StringRef rhs = name.drop_front(); + do { + std::tie(lhs, rhs) = rhs.split(';'); + int64_t value; + if (lhs.getAsInteger(10, value)) { + emitError(UnknownLoc::get(context)) + << "couldn't parse integer pass argument " << name; + return std::nullopt; + } + trailingBindings[position].push_back( + Builder(context).getI64IntegerAttr(value)); + } while (!rhs.empty()); + } else if (name.starts_with("^")) { + debugBindNames.emplace_back(OperationName(name.drop_front(), context)); + } else { + debugBindNames.emplace_back(OperationName(name, context)); + } + } + + // Collect operations or results for extra bindings. 
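+    // Entries in `debugBindNames` are `std::nullopt` for the `#`-prefixed
+    // integer bindings already materialized above, so this walk only fills
+    // in operation-handle (plain name) and value-handle (`^`-prefixed)
+    // bindings.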
+ payloadRoot->walk([&](Operation *payload) { + for (auto &&[position, name] : llvm::enumerate(debugBindNames)) { + if (!name || payload->getName() != *name) + continue; + + if (StringRef(*std::next(debugBindTrailingArgs.begin(), position)) + .starts_with("^")) { + llvm::append_range(trailingBindings[position], payload->getResults()); + } else { + trailingBindings[position].push_back(payload); + } + } + }); + + RaggedArray bindings; + bindings.push_back(ArrayRef{payloadRoot}); + for (SmallVector &trailing : trailingBindings) + bindings.push_back(std::move(trailing)); + return bindings; + } + public: using Base::Base; @@ -67,34 +134,18 @@ class InterpreterPass findPayloadRoot(getOperation(), debugPayloadRootTag); if (!payloadRoot) return signalPassFailure(); - auto debugBindNames = llvm::map_to_vector( - debugBindTrailingArgs, - [&](const std::string &name) { return OperationName(name, context); }); - SmallVector, 2> trailingBindings; - trailingBindings.resize(debugBindNames.size()); - payloadRoot->walk([&](Operation *payload) { - for (auto &&[position, name] : llvm::enumerate(debugBindNames)) { - if (payload->getName() == name) - trailingBindings[position].push_back(payload); - } - }); Operation *transformEntryPoint = transform::detail::findTransformEntryPoint( getOperation(), transformModule, entryPoint); - if (!transformEntryPoint) { - getOperation()->emitError() - << "could not find transform entry point: " << entryPoint - << " in either payload or transform module"; + if (!transformEntryPoint) return signalPassFailure(); - } - - RaggedArray bindings; - bindings.push_back(ArrayRef{payloadRoot}); - for (SmallVector &trailing : trailingBindings) - bindings.push_back(std::move(trailing)); + std::optional> bindings = + parseArguments(payloadRoot); + if (!bindings) + return signalPassFailure(); if (failed(transform::applyTransformNamedSequence( - bindings, + *bindings, cast(transformEntryPoint), transformModule, options.enableExpensiveChecks(!disableExpensiveChecks)))) { diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir index 316b90f85236e47..255ff5f31ed3f5a 100644 --- a/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir +++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-concurrent-source.mlir @@ -1,16 +1,21 @@ // RUN: mlir-opt %s // No need to check anything else than parsing here, this is being used by another test as data. 
-transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - pdl.pattern @func_return : benefit(1) { - %0 = pdl.operation "func.return" - pdl.rewrite %0 with "transform.dialect" - } +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + pdl.pattern @func_return : benefit(1) { + %0 = pdl.operation "func.return" + pdl.rewrite %0 with "transform.dialect" + } - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return"> - transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return"> + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @func_return in %arg1 : (!transform.any_op) -> !transform.op<"func.return"> + transform.debug.emit_remark_at %0, "matched" : !transform.op<"func.return"> + } + } + transform.yield } } diff --git a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir index 5956c86ebbe4b24..f6b7f787cc2c38d 100644 --- a/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir +++ b/mlir/test/Dialect/Transform/include/test-interpreter-external-source.mlir @@ -1,11 +1,13 @@ // RUN: mlir-opt %s // No need to check anything else than parsing here, this is being used by another test as data. -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op - transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} { - ^bb1(%arg1: !transform.any_op): - transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.debug.emit_remark_at %arg0, "outer" : !transform.any_op + transform.sequence %arg0 : !transform.any_op failures(propagate) attributes {transform.target_tag="transform"} { + ^bb1(%arg1: !transform.any_op): + transform.debug.emit_remark_at %arg1, "inner" : !transform.any_op + } + transform.yield } } diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir index 9a7e7ca2f9536e5..1c018b1b1f7796b 100644 --- a/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir +++ b/mlir/test/Dialect/Transform/multi-arg-top-level-ops.mlir @@ -1,10 +1,15 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-ops=func.func bind-second-extra-to-ops=func.return})' \ -// RUN: --split-input-file --verify-diagnostics +// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\ +// RUN: debug-bind-trailing-args=func.func,func.return})" \ +// RUN: --split-input-file --verify-diagnostics -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op): - transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op - transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op, + %arg2: 
!transform.any_op) { + transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_op + transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_op + transform.yield + } } // expected-remark @below {{first extra}} @@ -26,9 +31,13 @@ func.func @bar(%arg0: i1) { // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param): - // expected-error @above {{wrong kind of value provided for top-level parameter}} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op, + %arg2: !transform.param) { + // expected-error @above {{wrong kind of value provided for top-level parameter}} + transform.yield + } } func.func @foo() { @@ -37,9 +46,13 @@ func.func @foo() { // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value): - // expected-error @above {{wrong kind of value provided for the top-level value handle}} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op, + %arg2: !transform.any_value) { + // expected-error @above {{wrong kind of value provided for the top-level value handle}} + transform.yield + } } func.func @foo() { @@ -48,19 +61,27 @@ func.func @foo() { // ----- -// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}} -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op): + +module attributes {transform.with_named_sequence} { + // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}} + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op) { + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_op): - transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) { - ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op): - transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op - transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op, + %arg2: !transform.any_op) { + transform.sequence %arg0, %arg1, %arg2 : !transform.any_op, !transform.any_op, !transform.any_op failures(propagate) { + ^bb0(%arg3: !transform.any_op, %arg4: !transform.any_op, %arg5: !transform.any_op): + transform.debug.emit_remark_at %arg4, "first extra" : !transform.any_op + transform.debug.emit_remark_at %arg5, "second extra" : !transform.any_op + } + transform.yield } } diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir index f59a4b6d4ccc32f..6486bcae3294e4a 100644 --- a/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir +++ b/mlir/test/Dialect/Transform/multi-arg-top-level-params.mlir @@ -1,24 +1,37 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-params=1,2,3 bind-second-extra-to-params=42,45})' \ +// RUN: mlir-opt %s 
--pass-pipeline='builtin.module(transform-interpreter{\ +// RUN: debug-bind-trailing-args=#1;2;3,#42;45})' \ // RUN: --split-input-file --verify-diagnostics -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.param, %arg2: !transform.param): - // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}} - transform.debug.emit_param_as_remark %arg1 : !transform.param - // expected-remark @below {{42 : i64, 45 : i64}} - transform.debug.emit_param_as_remark %arg2 : !transform.param +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.param, + %arg2: !transform.param) { + // expected-remark @below {{1 : i64, 2 : i64, 3 : i64}} + transform.debug.emit_param_as_remark %arg1 : !transform.param + // expected-remark @below {{42 : i64, 45 : i64}} + transform.debug.emit_param_as_remark %arg2 : !transform.param + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.param): - // expected-error @above {{wrong kind of value provided for top-level operation handle}} +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_op, + // expected-error @above {{wrong kind of value provided for top-level operation handle}} + %arg2: !transform.param) { + transform.yield + } } // ----- -// expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}} -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.param, %arg2: !transform.param, %arg3: !transform.param): +module attributes {transform.with_named_sequence} { + // expected-error @below {{operation expects 3 extra value bindings, but 2 were provided to the interpreter}} + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.param, + %arg2: !transform.param, %arg3: !transform.param) { + transform.yield + } } diff --git a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir index 38d7e28697774d0..dcc1079267dc7ce 100644 --- a/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir +++ b/mlir/test/Dialect/Transform/multi-arg-top-level-values.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(test-transform-dialect-interpreter{bind-first-extra-to-results-of-ops=test.some_returning_op bind-second-extra-to-results-of-ops=test.some_other_returning_op})' \ +// RUN: mlir-opt %s --pass-pipeline='builtin.module(transform-interpreter{\ +// RUN: debug-bind-trailing-args=^test.some_returning_op,^test.some_other_returning_op})' \ // RUN: --split-input-file --verify-diagnostics // Note that diagnostic checker will merge two diagnostics with the same message @@ -21,10 +22,14 @@ // expected-note @below {{value handle points to an op result #1}} %2:2 = "test.some_other_returning_op"() : () -> (f32, f64) -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value, %arg2: !transform.any_value): - transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value - transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg0: !transform.any_op, %arg1: !transform.any_value, + %arg2: 
!transform.any_value) { + transform.debug.emit_remark_at %arg1, "first extra" : !transform.any_value + transform.debug.emit_remark_at %arg2, "second extra" : !transform.any_value + transform.yield + } } // ----- @@ -32,14 +37,19 @@ transform.sequence failures(propagate) { %0:2 = "test.some_returning_op"() : () -> (i32, i64) %1 = "test.some_returning_op"() : () -> index -transform.sequence failures(propagate) { -// expected-error @below {{wrong kind of value provided for top-level operation handle}} -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + // expected-error @below {{wrong kind of value provided for top-level operation handle}} + %arg0: !transform.any_op, %arg1: !transform.any_op, %arg2: !transform.any_value) { + transform.yield + } } // ----- -// expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}} -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op, %arg1: !transform.any_value): +module attributes {transform.with_named_sequence} { + // expected-error @below {{operation expects 1 extra value bindings, but 2 were provided to the interpreter}} + transform.named_sequence @__transform_main(%arg0: !transform.any_op, %arg1: !transform.any_value) { + transform.yield + } } diff --git a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir index c7dad582dd432c1..99301ea23c6f8d6 100644 --- a/mlir/test/Dialect/Transform/test-interpreter-debug.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter-debug.mlir @@ -1,19 +1,21 @@ -// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{debug-payload-root-tag=payload debug-transform-root-tag=transform})" \ -// RUN: --allow-unregistered-dialect --split-input-file --verify-diagnostics +// RUN: mlir-opt %s --pass-pipeline="builtin.module(transform-interpreter{\ +// RUN: debug-payload-root-tag=payload \ +// RUN: entry-point=transform})" \ +// RUN: --allow-unregistered-dialect --split-input-file --verify-diagnostics // expected-error @below {{could not find the operation with transform.target_tag="payload" attribute}} -module { - transform.sequence failures(suppress) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @transform(%arg0: !transform.any_op) { + transform.yield } } // ----- -// expected-error @below {{could not find the operation with transform.target_tag="transform" attribute}} -module { - transform.sequence failures(suppress) { - ^bb0(%arg0: !transform.any_op): +// expected-error @below {{could not find a nested named sequence with name: transform}} +module attributes {transform.with_named_sequence} { + transform.named_sequence @not_transform(%arg0: !transform.any_op) { + transform.yield } module attributes {transform.target_tag="payload"} {} @@ -21,42 +23,16 @@ module { // ----- -// expected-error @below {{more than one operation with transform.target_tag="transform" attribute}} -module { - // expected-note @below {{first operation}} - transform.sequence failures(propagate) attributes {transform.target_tag="transform"} { - ^bb0(%arg0: !transform.any_op): - } - - // expected-note @below {{other operation}} - transform.sequence failures(propagate) attributes {transform.target_tag="transform"} { - ^bb0(%arg0: !transform.any_op): - } - - module attributes 
{transform.target_tag="payload"} {} -} - -// ----- - -module { - // expected-error @below {{expected the transform entry point to be a top-level transform op}} - func.func private @foo() attributes {transform.target_tag="transform"} - - module attributes {transform.target_tag="payload"} {} -} - -// ----- - -module { - transform.sequence failures(suppress) attributes {transform.target_tag="transform"} { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @transform(%arg0: !transform.any_op) { transform.debug.emit_remark_at %arg0, "payload" : !transform.any_op + transform.yield } - // This will not be executed because it's not tagged. - transform.sequence failures(suppress) { - ^bb0(%arg0: !transform.any_op): + // This will not be executed. + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { transform.debug.emit_remark_at %arg0, "some other text that is not printed" : !transform.any_op + transform.yield } module { diff --git a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir index 59c2b672a6e6b10..9884102c6c0ff5b 100644 --- a/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter-external-concurrent.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt %s --pass-pipeline="builtin.module(func.func(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir}))" \ +// RUN: mlir-opt %s --pass-pipeline="builtin.module(\ +// RUN: transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-concurrent-source.mlir},\ +// RUN: func.func(transform-interpreter))" \ // RUN: --verify-diagnostics // Exercising the pass on multiple functions of different lengths that may be diff --git a/mlir/test/Dialect/Transform/test-interpreter-external.mlir b/mlir/test/Dialect/Transform/test-interpreter-external.mlir index ba8e0c6870dbf8a..599ce05fcc40b1a 100644 --- a/mlir/test/Dialect/Transform/test-interpreter-external.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter-external.mlir @@ -1,4 +1,6 @@ -// RUN: mlir-opt %s --pass-pipeline="builtin.module(test-transform-dialect-interpreter{transform-file-name=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir})" \ +// RUN: mlir-opt %s --pass-pipeline="builtin.module(\ +// RUN: transform-preload-library{transform-library-paths=%p%{fs-sep}include%{fs-sep}test-interpreter-external-source.mlir},\ +// RUN: transform-interpreter)" \ // RUN: --verify-diagnostics // The schedule in the separate file emits remarks at the payload root. 
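The RUN-line updates above all follow the same two-pass pattern: preload the
transform script as a library, then run the interpreter, which resolves the
`@__transform_main` entry point from the preloaded module. A minimal sketch of
such a library file (with `library.mlir` and `payload.mlir` as hypothetical
paths):

// library.mlir -- preloaded, then applied by the interpreter via:
//   mlir-opt payload.mlir --pass-pipeline="builtin.module(\
//       transform-preload-library{transform-library-paths=library.mlir},\
//       transform-interpreter)"
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%root: !transform.any_op) {
    transform.debug.emit_remark_at %root, "applied from library" : !transform.any_op
    transform.yield
  }
}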
diff --git a/mlir/test/Dialect/Transform/test-interpreter.mlir b/mlir/test/Dialect/Transform/test-interpreter.mlir index de5807b2874b27a..b6850e2024d53dd 100644 --- a/mlir/test/Dialect/Transform/test-interpreter.mlir +++ b/mlir/test/Dialect/Transform/test-interpreter.mlir @@ -1411,7 +1411,6 @@ module attributes {transform.with_named_sequence} { // ----- // expected-error @below {{could not find a nested named sequence with name: __transform_main}} -// expected-error @below {{could not find transform entry point: __transform_main in either payload or transform module}} module { } diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 65625457c868986..7cb5387b937d45c 100644 --- a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s // CHECK-LABEL: func @successful_pass_application( // CHECK: %[[c5:.*]] = arith.constant 5 : index @@ -9,10 +9,12 @@ func.func @successful_pass_application(%t: tensor<5xf32>) -> index { return %dim : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -22,12 +24,14 @@ func.func @pass_pipeline() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // This pipeline does not do anything. Just make sure that the pipeline is - // found and no error is produced. - transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // This pipeline does not do anything. Just make sure that the pipeline is + // found and no error is produced. 
+ transform.apply_registered_pass "test-options-pass-pipeline" to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -36,11 +40,13 @@ func.func @invalid_pass_name() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}} - transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{unknown pass or pass pipeline: non-existing-pass}} + transform.apply_registered_pass "non-existing-pass" to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -53,11 +59,13 @@ func.func @not_isolated_from_above(%t: tensor<5xf32>) -> index { return %dim : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{pass pipeline failed}} - transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{pass pipeline failed}} + transform.apply_registered_pass "canonicalize" to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -66,11 +74,13 @@ func.func @invalid_pass_option() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}} - transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}} + transform.apply_registered_pass "canonicalize" to %1 {options = "invalid-option=1"} : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- @@ -80,27 +90,29 @@ func.func @valid_pass_option() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_registered_pass "canonicalize" to %1 {options = "top-down=false"} : (!transform.any_op) -> !transform.any_op + 
transform.yield + } } // ----- -module { +module attributes {transform.with_named_sequence} { // expected-error @below {{trying to schedule a pass on an unsupported operation}} // expected-note @below {{target op}} func.func @invalid_target_op_type() { return } - transform.sequence failures(propagate) { - ^bb1(%arg1: !transform.any_op): + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op // func-bufferize can be applied only to ModuleOps. // expected-error @below {{pass pipeline failed}} transform.apply_registered_pass "func-bufferize" to %1 : (!transform.any_op) -> !transform.any_op + transform.yield } } diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir index 10cd9ef351fe544..0c41e81b17b522d 100644 --- a/mlir/test/Dialect/Transform/test-pattern-application.mlir +++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics | FileCheck %s // CHECK-LABEL: func @update_tracked_op_mapping() // CHECK: "test.container"() ({ @@ -11,15 +11,17 @@ func.func @update_tracked_op_mapping() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op - // Add an attribute to %1, which is now mapped to a new op. - transform.annotate %1 "annotated" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + // Add an attribute to %1, which is now mapped to a new op. + transform.annotate %1 "annotated" : !transform.any_op + transform.yield + } } // ----- @@ -33,19 +35,21 @@ func.func @replacement_op_not_found() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-note @below {{replacement is required because this handle must be updated}} - %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}} - // expected-note @below {{ran out of suitable replacement values}} - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op - // %1 must be used in some way. If no replacement payload op could be found, - // an error is thrown only if the handle is not dead. 
- transform.annotate %1 "annotated" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-note @below {{replacement is required because this handle must be updated}} + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below {{tracking listener failed to find replacement op during application of this transform op}} + // expected-note @below {{ran out of suitable replacement values}} + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + // %1 must be used in some way. If no replacement payload op could be found, + // an error is thrown only if the handle is not dead. + transform.annotate %1 "annotated" : !transform.any_op + transform.yield + } } // ----- @@ -61,14 +65,16 @@ func.func @replacement_op_for_dead_handle_not_found() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // No error because %1 is dead. - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // No error because %1 is dead. 
+ transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + transform.yield + } } // ----- @@ -84,14 +90,16 @@ func.func @replacement_op_not_found_silenced() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } {transform.silence_tracking_failures} : !transform.any_op - transform.annotate %1 "annotated" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } {transform.silence_tracking_failures} : !transform.any_op + transform.annotate %1 "annotated" : !transform.any_op + transform.yield + } } // ----- @@ -103,12 +111,14 @@ func.func @patterns_apply_only_to_target_body() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): -%0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + transform.yield + } } // ----- @@ -125,16 +135,18 @@ func.func @erase_tracked_op() { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.debug.emit_remark_at %1, "matched op" : !transform.any_op - transform.apply_patterns to %0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op - // No marker should be printed. - transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.debug.emit_remark_at %1, "matched op" : !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + // No marker should be printed. 
+ transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op + transform.yield + } } // ----- @@ -143,7 +155,7 @@ transform.sequence failures(propagate) { // CHECK: "test.container"() ({ // CHECK-NEXT: ^bb0: // CHECK-NEXT: }) : () -> () -module { +module attributes {transform.with_named_sequence} { func.func @erase_tracked_op_in_named_sequence() { "test.container"() ({ // expected-remark @below {{matched op}} @@ -152,23 +164,21 @@ module { return } - module attributes { transform.with_named_sequence } { - transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () { - transform.apply_patterns to %arg0 { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op - transform.yield - } + transform.named_sequence @foo(%arg0: !transform.any_op {transform.readonly}) -> () { + transform.apply_patterns to %arg0 { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + transform.yield + } - transform.sequence failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.debug.emit_remark_at %1, "matched op" : !transform.any_op - include @foo failures(propagate) (%0) : (!transform.any_op) -> () - // No marker should be printed. - transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op - } + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.erase_op"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.debug.emit_remark_at %1, "matched op" : !transform.any_op + transform.include @foo failures(propagate) (%0) : (!transform.any_op) -> () + // No marker should be printed. 
+ transform.debug.emit_remark_at %1, "op was deleted" : !transform.any_op + transform.yield } } @@ -183,13 +193,15 @@ func.func @canonicalization(%t: tensor<5xf32>) -> index { return %dim : index } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %1 { - transform.apply_patterns.canonicalization - } : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["tensor.dim"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.canonicalization + } : !transform.any_op + transform.yield + } } // ----- @@ -200,13 +212,13 @@ module { return } - module { - transform.sequence failures(propagate) { - ^bb1(%arg1: !transform.any_op): + module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { // expected-error @below {{cannot apply transform to itself (or one of its ancestors)}} transform.apply_patterns to %arg1 { transform.apply_patterns.canonicalization } : !transform.any_op + transform.yield } } } @@ -224,12 +236,14 @@ func.func @canonicalization_and_cse(%m: memref<5xf32>) { return } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %1 { - transform.apply_patterns.canonicalization - } {apply_cse} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %1 { + transform.apply_patterns.canonicalization + } {apply_cse} : !transform.any_op + transform.yield + } } // ----- @@ -243,15 +257,17 @@ func.func @full_dialect_conversion() -> tensor<5xf32> { return %0 : tensor<5xf32> } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_conversion_patterns to %0 { - transform.apply_conversion_patterns.transform.test_conversion_patterns - } with type_converter { - transform.apply_conversion_patterns.transform.test_type_converter - } {legal_ops = ["func.func", "func.return", "test.new_op"]} - : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %0 { + transform.apply_conversion_patterns.transform.test_conversion_patterns + } with type_converter { + transform.apply_conversion_patterns.transform.test_type_converter + } {legal_ops = ["func.func", "func.return", "test.new_op"]} + : !transform.any_op + transform.yield + } } // ----- @@ -266,16 +282,18 @@ func.func @full_dialect_conversion_failed() -> tensor<5xf32> { return %0 : tensor<5xf32> } 
-transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below{{dialect conversion failed}} - transform.apply_conversion_patterns to %0 { - transform.apply_conversion_patterns.transform.test_conversion_patterns - } with type_converter { - transform.apply_conversion_patterns.transform.test_type_converter - } {legal_ops = ["func.func", "func.return", "test.new_op"]} - : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below{{dialect conversion failed}} + transform.apply_conversion_patterns to %0 { + transform.apply_conversion_patterns.transform.test_conversion_patterns + } with type_converter { + transform.apply_conversion_patterns.transform.test_type_converter + } {legal_ops = ["func.func", "func.return", "test.new_op"]} + : !transform.any_op + transform.yield + } } // ----- @@ -294,98 +312,108 @@ func.func @partial_dialect_conversion() -> tensor<5xf32> { return %0 : tensor<5xf32> } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_conversion_patterns to %0 { - transform.apply_conversion_patterns.transform.test_conversion_patterns - } with type_converter { - transform.apply_conversion_patterns.transform.test_type_converter - } {legal_ops = ["func.func", "func.return", "test.new_op"], - partial_conversion} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %0 { + transform.apply_conversion_patterns.transform.test_conversion_patterns + } with type_converter { + transform.apply_conversion_patterns.transform.test_type_converter + } {legal_ops = ["func.func", "func.return", "test.new_op"], + partial_conversion} : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}} - transform.apply_conversion_patterns to %0 { - // expected-note @below{{pattern descriptor op}} - transform.apply_conversion_patterns.transform.test_conversion_patterns - } {illegal_ops = ["test.foo"]} : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @below{{pattern descriptor does not specify type converter and apply_conversion_patterns op has no default type converter}} + transform.apply_conversion_patterns to %0 { + // expected-note @below{{pattern descriptor op}} + transform.apply_conversion_patterns.transform.test_conversion_patterns + } {illegal_ops = ["test.foo"]} : !transform.any_op + transform.yield + } } // ----- -transform.sequence 
failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_conversion_patterns to %0 { - // expected-error @below{{expected LLVMTypeConverter}} - transform.apply_conversion_patterns.dialect_to_llvm "test" - } with type_converter { - transform.apply_conversion_patterns.transform.test_type_converter - } {illegal_ops = ["test.foo"], - legal_ops = ["func.func", "func.return", "test.new_op"]} - : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %0 { + // expected-error @below{{expected LLVMTypeConverter}} + transform.apply_conversion_patterns.dialect_to_llvm "test" + } with type_converter { + transform.apply_conversion_patterns.transform.test_type_converter + } {illegal_ops = ["test.foo"], + legal_ops = ["func.func", "func.return", "test.new_op"]} + : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_conversion_patterns to %0 { - // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}} - transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist" - } with type_converter { - transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter - } {illegal_ops = ["test.foo"], - legal_ops = ["func.func", "func.return", "test.new_op"]} - : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %0 { + // expected-error @below{{unknown dialect or dialect not loaded: this_dialect_does_not_exist}} + transform.apply_conversion_patterns.dialect_to_llvm "this_dialect_does_not_exist" + } with type_converter { + transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter + } {illegal_ops = ["test.foo"], + legal_ops = ["func.func", "func.return", "test.new_op"]} + : !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.apply_conversion_patterns to %0 { - // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}} - transform.apply_conversion_patterns.dialect_to_llvm "transform" - } with type_converter { - transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter - } {illegal_ops = ["test.foo"], - legal_ops = ["func.func", "func.return", "test.new_op"]} - : !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_conversion_patterns to %0 { + // expected-error @below{{dialect does not implement ConvertToLLVMPatternInterface or extension was not loaded: transform}} + 
transform.apply_conversion_patterns.dialect_to_llvm "transform" + } with type_converter { + transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter + } {illegal_ops = ["test.foo"], + legal_ops = ["func.func", "func.return", "test.new_op"]} + : !transform.any_op + transform.yield + } } // ----- module attributes { transform.with_named_sequence } { -func.func @replacement_op_not_found() { - // No op replacement can be found, but there are no handles that must be - // updated. No error should be reported. - "test.container"() ({ - %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32) - }) : () -> () - return -} + func.func @replacement_op_not_found() { + // No op replacement can be found, but there are no handles that must be + // updated. No error should be reported. + "test.container"() ({ + %0 = "test.foo"() {replace_with_new_op = "test.bar"} : () -> (i32) + }) : () -> () + return + } -transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) { - transform.apply_patterns to %container { - transform.apply_patterns.transform.test_patterns - } : !transform.any_op - transform.yield -} + transform.named_sequence @patterns(%container: !transform.any_op {transform.readonly}) { + transform.apply_patterns to %container { + transform.apply_patterns.transform.test_patterns + } : !transform.any_op + transform.yield + } -transform.sequence failures(propagate) { -^bb1(%arg1: !transform.any_op): - %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.annotate %1 "annotated" : !transform.any_op - transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> () -} + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.annotate %1 "annotated" : !transform.any_op + transform.include @patterns failures(propagate) (%0) : (!transform.any_op) -> () + transform.yield + } } diff --git a/mlir/test/Dialect/Transform/test-pdl-extension.mlir b/mlir/test/Dialect/Transform/test-pdl-extension.mlir index a9710f75531262b..a3349c1ba505942 100644 --- a/mlir/test/Dialect/Transform/test-pdl-extension.mlir +++ b/mlir/test/Dialect/Transform/test-pdl-extension.mlir @@ -1,21 +1,26 @@ -// RUN: mlir-opt %s --test-transform-dialect-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics - -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb0(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - transform.debug.emit_remark_at %0, "matched" : !transform.any_op - } - - pdl.pattern @some : benefit(1) { - %0 = pdl.operation "test.some_op" - pdl.rewrite %0 with "transform.dialect" - } - - pdl.pattern @other : benefit(1) { - %0 = pdl.operation "test.other_op" - pdl.rewrite %0 with "transform.dialect" +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file --verify-diagnostics + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: 
!transform.any_op): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb0(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + transform.debug.emit_remark_at %0, "matched" : !transform.any_op + } + + pdl.pattern @some : benefit(1) { + %0 = pdl.operation "test.some_op" + pdl.rewrite %0 with "transform.dialect" + } + + pdl.pattern @other : benefit(1) { + %0 = pdl.operation "test.other_op" + pdl.rewrite %0 with "transform.dialect" + } + } + transform.yield } } @@ -28,17 +33,22 @@ transform.with_pdl_patterns { // ----- -transform.with_pdl_patterns { -^bb0(%arg0: !transform.any_op): - sequence %arg0 : !transform.any_op failures(propagate) { - ^bb1(%arg1: !transform.any_op): - %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op - } - - pdl.pattern @some : benefit(1) { - %0 = pdl.operation "test.some_op" - pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation) - pdl.rewrite %0 with "transform.dialect" +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root: !transform.any_op) { + transform.with_pdl_patterns %root : !transform.any_op { + ^bb0(%arg0: !transform.any_op): + sequence %arg0 : !transform.any_op failures(propagate) { + ^bb1(%arg1: !transform.any_op): + %0 = pdl_match @some in %arg1 : (!transform.any_op) -> !transform.any_op + } + + pdl.pattern @some : benefit(1) { + %0 = pdl.operation "test.some_op" + pdl.apply_native_constraint "verbose_constraint"(%0 : !pdl.operation) + pdl.rewrite %0 with "transform.dialect" + } + } + transform.yield } } diff --git a/mlir/test/Dialect/Transform/transform-state-extension.mlir b/mlir/test/Dialect/Transform/transform-state-extension.mlir index a26293fbe51ca61..e8c0b7a8a3aac00 100644 --- a/mlir/test/Dialect/Transform/transform-state-extension.mlir +++ b/mlir/test/Dialect/Transform/transform-state-extension.mlir @@ -1,89 +1,95 @@ -// RUN: mlir-opt %s -test-transform-dialect-interpreter -verify-diagnostics -split-input-file +// RUN: mlir-opt %s -transform-interpreter -verify-diagnostics -split-input-file // expected-note @below {{associated payload op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-remark @below {{extension absent}} - test_check_if_test_extension_present %arg0 : !transform.any_op - test_add_test_extension "A" + transform.test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_add_test_extension "A" // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 : !transform.any_op - test_remove_test_extension + transform.test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_remove_test_extension // expected-remark @below {{extension absent}} - test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_check_if_test_extension_present %arg0 : !transform.any_op + transform.yield } } // ----- // expected-note @below {{associated payload op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): - test_add_test_extension "A" - test_remove_test_extension - test_add_test_extension "B" +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_add_test_extension "A" + transform.test_remove_test_extension + 
transform.test_add_test_extension "B" // expected-remark @below {{extension present, B}} - test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_check_if_test_extension_present %arg0 : !transform.any_op + transform.yield } } // ----- // expected-note @below {{associated payload op}} -module { - transform.sequence failures(propagate) { - ^bb0(%arg0: !transform.any_op): - test_add_test_extension "A" +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_add_test_extension "A" // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_check_if_test_extension_present %arg0 : !transform.any_op // expected-note @below {{associated payload op}} - test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op // expected-remark @below {{extension present, A}} - test_check_if_test_extension_present %arg0 : !transform.any_op + transform.test_check_if_test_extension_present %arg0 : !transform.any_op + transform.yield } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - test_add_test_extension "A" - // This is okay because we are replacing the top-level module operation - // (0 results) with this operation that has _more_ (1) results. - %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_add_test_extension "A" + // This is okay because we are replacing the top-level module operation + // (0 results) with this operation that has _more_ (1) results. + %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - test_add_test_extension "A" - %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op - // This is still okay. Even though we are replacing the previous - // operation with (1 result) with this operation that has less (0) results, - // there is no handle to the result, hence no issue with value handle update. - test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_add_test_extension "A" + %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + // This is still okay. Even though we are replacing the previous + // operation with (1 result) with this operation that has less (0) results, + // there is no handle to the result, hence no issue with value handle update. 
+ transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> !transform.any_op + transform.yield + } } // ----- -transform.sequence failures(propagate) { -^bb0(%arg0: !transform.any_op): - test_add_test_extension "A" - // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}} - %dummy = test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op - %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value - test_remap_operand_to_self %dummy : (!transform.any_op) -> () +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + transform.test_add_test_extension "A" + // expected-error @below {{cannot replace an op with another op producing fewer results while tracking handles}} + %dummy = transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + %valuehandle = transform.get_result %dummy[0] : (!transform.any_op) -> !transform.any_value + transform.test_remap_operand_to_self %dummy : (!transform.any_op) -> () + transform.yield + } } // ----- -module { - transform.sequence failures(suppress) { - ^bb0(%arg0: !transform.any_op): +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { // expected-error @below {{TestTransformStateExtension missing}} - test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + transform.test_remap_operand_to_self %arg0 : (!transform.any_op) -> !transform.any_op + transform.yield } } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir index d86ff56d79e33c5..79121bf31c26ed5 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/test-contraction.mlir @@ -1,4 +1,4 @@ -// DEFINE: %{compile} = mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule\ +// DEFINE: %{compile} = mlir-opt %s -transform-interpreter -test-transform-dialect-erase-schedule\ // DEFINE: -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage\ // DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t // DEFINE: %{entry} = @@ -188,12 +188,14 @@ func.func @matmul_f32() { return } -transform.sequence failures(propagate) { -^bb1(%module_op: !transform.any_op): - %f = transform.structured.match ops{["func.func"]} in %module_op - : (!transform.any_op) -> !transform.any_op +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%module_op: !transform.any_op) { + %f = transform.structured.match ops{["func.func"]} in %module_op + : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %f { - transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct" - } : !transform.any_op + transform.apply_patterns to %f { + transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct" + } : !transform.any_op + transform.yield + } } From 686ec7c2e9638d5b96d922886827a532d339856d Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 28 Feb 2024 10:26:41 +0000 Subject: [PATCH 052/114] [AArch64][GlobalISel] Legalize G_STORE for v4s8 vector (#82498) Lowers `G_STORE v4s8, ptr` into `s32 = G_BITCAST v4s8` `G_STORE s32, ptr` --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 9 +++++- 
llvm/test/CodeGen/AArch64/store.ll | 29 +++++++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index a2e805e8cb56dc2..3205484bb0631d3 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -52,6 +52,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) const LLT v16s8 = LLT::fixed_vector(16, 8); const LLT v8s8 = LLT::fixed_vector(8, 8); const LLT v4s8 = LLT::fixed_vector(4, 8); + const LLT v2s8 = LLT::fixed_vector(2, 8); const LLT v8s16 = LLT::fixed_vector(8, 16); const LLT v4s16 = LLT::fixed_vector(4, 16); const LLT v2s16 = LLT::fixed_vector(2, 16); @@ -422,8 +423,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(0, s64, 2) .clampMaxNumElements(0, p0, 2) .lowerIfMemSizeNotPow2() + // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out + .bitcastIf(typeInSet(0, {v4s8}), + [=](const LegalityQuery &Query) { + const LLT VecTy = Query.Types[0]; + return std::pair(0, LLT::scalar(VecTy.getSizeInBits())); + }) .customIf(IsPtrVecPred) - .scalarizeIf(typeIs(0, v2s16), 0); + .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0); getActionDefinitionsBuilder(G_INDEXED_STORE) // Idx 0 == Ptr, Idx 1 == Val diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 251e5382ef396fc..bf22d79a4df994b 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for store_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for store_v4i8 +; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; ===== Legal Scalars ===== define void @store_i8(i8 %a, ptr %ptr){ @@ -110,14 +107,22 @@ define void @store_v2i64(<2 x i64> %a, ptr %ptr){ ; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== define void @store_v2i8(<2 x i8> %a, ptr %ptr){ -; CHECK-LABEL: store_v2i8: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov w8, v0.s[1] -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: strb w9, [x0] -; CHECK-NEXT: strb w8, [x0, #1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: store_v2i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: mov w8, v0.s[1] +; CHECK-SD-NEXT: fmov w9, s0 +; CHECK-SD-NEXT: strb w9, [x0] +; CHECK-SD-NEXT: strb w8, [x0, #1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v2i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: str b0, [x0] +; CHECK-GI-NEXT: str b1, [x0, #1] +; CHECK-GI-NEXT: ret store <2 x i8> %a, ptr %ptr ret void } From 41427b0e8eeed9891d4e98ea9ac6a812682fd507 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 28 Feb 2024 10:42:16 +0000 Subject: [PATCH 053/114] [AArch64] Disable FastISel/GlobalISel for ZT0 state (#82768) For __arm_new("zt0") we need to have special setup code in the prologue. 
For calls that don't preserve zt0, we need to emit code to preserve ZT0
around the call. This is only emitted by SelectionDAG ISel at the
moment.
---
 llvm/lib/Target/AArch64/AArch64FastISel.cpp   |  3 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +-
 .../AArch64/GISel/AArch64CallLowering.cpp     |  3 +-
 .../AArch64/sme-disable-gisel-fisel.ll        | 65 ++++++++++++++++++-
 4 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 635beeed0df8339..49bcab588e5204d 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -5179,7 +5179,8 @@ FastISel *AArch64::createFastISel(FunctionLoweringInfo &FuncInfo,
                                   const TargetLibraryInfo *LibInfo) {
 
   SMEAttrs CallerAttrs(*FuncInfo.Fn);
-  if (CallerAttrs.hasZAState() || CallerAttrs.hasStreamingInterfaceOrBody() ||
+  if (CallerAttrs.hasZAState() || CallerAttrs.hasZT0State() ||
+      CallerAttrs.hasStreamingInterfaceOrBody() ||
       CallerAttrs.hasStreamingCompatibleInterface())
     return nullptr;
   return new AArch64FastISel(FuncInfo, LibInfo);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3b92e95d7c2876f..b86661c8a4e0d41 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -25892,7 +25892,8 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
     auto CallerAttrs = SMEAttrs(*Inst.getFunction());
     auto CalleeAttrs = SMEAttrs(*Base);
     if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
-        CallerAttrs.requiresLazySave(CalleeAttrs))
+        CallerAttrs.requiresLazySave(CalleeAttrs) ||
+        CallerAttrs.requiresPreservingZT0(CalleeAttrs))
       return true;
   }
   return false;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 3dc3d31a34e84e4..26dbad713594aa3 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -535,7 +535,8 @@ bool AArch64CallLowering::fallBackToDAGISel(const MachineFunction &MF) const {
   }
 
   SMEAttrs Attrs(F);
-  if (Attrs.hasZAState() || Attrs.hasStreamingInterfaceOrBody() ||
+  if (Attrs.hasZAState() || Attrs.hasZT0State() ||
+      Attrs.hasStreamingInterfaceOrBody() ||
       Attrs.hasStreamingCompatibleInterface())
     return true;
 
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 2a78012045ff427..cd348be5d771d18 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=true -global-isel=false -fast-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-FISEL
-; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme < %s \
+; RUN: llc -fast-isel=false -global-isel=true -global-isel-abort=0 -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-GISEL
 
 
@@ -447,3 +447,64 @@ define float @frem_call_sm_compat(float %a, float %b) "aarch64_pstate_sm_compati
   %res = frem float %a, %b
   ret float %res
 }
+
+;
+; Check ZT0 State
+;
+
+declare double @zt0_shared_callee(double) "aarch64_inout_zt0" + +define double @zt0_new_caller_to_zt0_shared_callee(double %x) nounwind noinline optnone "aarch64_new_zt0" { +; CHECK-COMMON-LABEL: zt0_new_caller_to_zt0_shared_callee: +; CHECK-COMMON: // %bb.0: // %prelude +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: str x30, [sp, #64] // 8-byte Folded Spill +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: cbz x8, .LBB13_2 +; CHECK-COMMON-NEXT: b .LBB13_1 +; CHECK-COMMON-NEXT: .LBB13_1: // %save.za +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: str zt0, [x8] +; CHECK-COMMON-NEXT: bl __arm_tpidr2_save +; CHECK-COMMON-NEXT: ldr zt0, [x8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: b .LBB13_2 +; CHECK-COMMON-NEXT: .LBB13_2: // %entry +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: zero { zt0 } +; CHECK-COMMON-NEXT: bl zt0_shared_callee +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d1, x8 +; CHECK-COMMON-NEXT: fadd d0, d0, d1 +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret +entry: + %call = call double @zt0_shared_callee(double %x) + %add = fadd double %call, 4.200000e+01 + ret double %add; +} + +define double @zt0_shared_caller_to_normal_callee(double %x) nounwind noinline optnone "aarch64_inout_zt0" { +; CHECK-COMMON-LABEL: zt0_shared_caller_to_normal_callee: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: sub sp, sp, #80 +; CHECK-COMMON-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x19, sp +; CHECK-COMMON-NEXT: str zt0, [x19] +; CHECK-COMMON-NEXT: smstop za +; CHECK-COMMON-NEXT: bl normal_callee +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: ldr zt0, [x19] +; CHECK-COMMON-NEXT: mov x8, #4631107791820423168 // =0x4045000000000000 +; CHECK-COMMON-NEXT: fmov d1, x8 +; CHECK-COMMON-NEXT: fadd d0, d0, d1 +; CHECK-COMMON-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-COMMON-NEXT: add sp, sp, #80 +; CHECK-COMMON-NEXT: ret +entry: + %call = call double @normal_callee(double %x) + %add = fadd double %call, 4.200000e+01 + ret double %add; +} From fd336c33b6c94ab1f4cfd7e13ce4ab0e6ddda92e Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Wed, 28 Feb 2024 10:57:10 +0000 Subject: [PATCH 054/114] [AArch64][GlobalISel] Pre-Commit Test for Legalize G_LOAD v4i8 (#82989) --- llvm/test/CodeGen/AArch64/load.ll | 313 ++++++++++++++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/load.ll diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll new file mode 100644 index 000000000000000..d82c1fe55a2dd2c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -0,0 +1,313 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for load_v2i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for load_v4i8 + +; ===== Legal Scalars ===== + +define i8 @load_i8(ptr %ptr){ +; CHECK-LABEL: load_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w0, [x0] +; CHECK-NEXT: ret + %a = load i8 , ptr %ptr + 
ret i8 %a +} + +define i16 @load_i16(ptr %ptr){ +; CHECK-LABEL: load_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrh w0, [x0] +; CHECK-NEXT: ret + %a = load i16 , ptr %ptr + ret i16 %a +} + +define i32 @load_i32(ptr %ptr){ +; CHECK-LABEL: load_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret + %a = load i32 , ptr %ptr + ret i32 %a +} + +define i64 @load_i64(ptr %ptr){ +; CHECK-LABEL: load_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr x0, [x0] +; CHECK-NEXT: ret + %a = load i64 , ptr %ptr + ret i64 %a +} + +; ===== Legal Vector Types ===== + +define <8 x i8> @load_v8i8(ptr %ptr){ +; CHECK-LABEL: load_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %a = load <8 x i8>, ptr %ptr + ret <8 x i8> %a +} + +define <16 x i8> @load_v16i8(ptr %ptr){ +; CHECK-LABEL: load_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %a = load <16 x i8>, ptr %ptr + ret <16 x i8> %a +} + +define <4 x i16> @load_v4i16(ptr %ptr){ +; CHECK-LABEL: load_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %a = load <4 x i16>, ptr %ptr + ret <4 x i16> %a +} + +define <8 x i16> @load_v8i16(ptr %ptr){ +; CHECK-LABEL: load_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %a = load <8 x i16>, ptr %ptr + ret <8 x i16> %a +} + +define <2 x i32> @load_v2i32(ptr %ptr){ +; CHECK-LABEL: load_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ret + %a = load <2 x i32>, ptr %ptr + ret <2 x i32> %a +} + +define <4 x i32> @load_v4i32(ptr %ptr){ +; CHECK-LABEL: load_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %a = load <4 x i32>, ptr %ptr + ret <4 x i32> %a +} + +define <2 x i64> @load_v2i64(ptr %ptr){ +; CHECK-LABEL: load_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ret + %a = load <2 x i64>, ptr %ptr + ret <2 x i64> %a +} + +; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== + +define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){ +; CHECK-LABEL: load_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-NEXT: add x8, x0, #1 +; CHECK-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %a = load <2 x i8>, ptr %ptr + ret <2 x i8> %a +} + +define i32 @load_v4i8(ptr %ptr, <4 x i8> %b){ +; CHECK-LABEL: load_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr w0, [x0] +; CHECK-NEXT: ret + %a = load <4 x i8>, ptr %ptr + %c = bitcast <4 x i8> %a to i32 + ret i32 %c +} + +define <32 x i8> @load_v32i8(ptr %ptr){ +; CHECK-LABEL: load_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %a = load <32 x i8>, ptr %ptr + ret <32 x i8> %a +} + +define <2 x i16> @load_v2i16(ptr %ptr){ +; CHECK-SD-LABEL: load_v2i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] +; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v2i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %a = load <2 x i16>, ptr %ptr + ret <2 x i16> %a +} + +define <16 x i16> @load_v16i16(ptr %ptr){ +; CHECK-LABEL: load_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %a = load <16 x i16>, ptr %ptr + ret <16 x i16> %a +} + +define <1 x i32> @load_v1i32(ptr %ptr){ +; 
CHECK-LABEL: load_v1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ret + %a = load <1 x i32>, ptr %ptr + ret <1 x i32> %a +} + +define <8 x i32> @load_v8i32(ptr %ptr){ +; CHECK-LABEL: load_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %a = load <8 x i32>, ptr %ptr + ret <8 x i32> %a +} + +define <4 x i64> @load_v4i64(ptr %ptr){ +; CHECK-LABEL: load_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ret + %a = load <4 x i64>, ptr %ptr + ret <4 x i64> %a +} + +; ===== Vectors with Non-Pow 2 Widths ===== + +define <3 x i8> @load_v3i8(ptr %ptr){ +; CHECK-SD-LABEL: load_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: umov w0, v0.b[0] +; CHECK-SD-NEXT: umov w1, v0.b[1] +; CHECK-SD-NEXT: umov w2, v0.b[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldrb w8, [x0] +; CHECK-GI-NEXT: ldrb w1, [x0, #1] +; CHECK-GI-NEXT: ldrb w2, [x0, #2] +; CHECK-GI-NEXT: mov w0, w8 +; CHECK-GI-NEXT: ret + %a = load <3 x i8>, ptr %ptr + ret <3 x i8> %a +} + +define <7 x i8> @load_v7i8(ptr %ptr){ +; CHECK-SD-LABEL: load_v7i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v7i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: ldr b1, [x0, #2] +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: ldr b1, [x0, #3] +; CHECK-GI-NEXT: mov v0.b[3], v1.b[0] +; CHECK-GI-NEXT: ldr b1, [x0, #4] +; CHECK-GI-NEXT: mov v0.b[4], v1.b[0] +; CHECK-GI-NEXT: ldr b1, [x0, #5] +; CHECK-GI-NEXT: mov v0.b[5], v1.b[0] +; CHECK-GI-NEXT: ldr b1, [x0, #6] +; CHECK-GI-NEXT: mov v0.b[6], v1.b[0] +; CHECK-GI-NEXT: mov v0.b[7], v0.b[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %a = load <7 x i8>, ptr %ptr + ret <7 x i8> %a +} + +define <3 x i16> @load_v3i16(ptr %ptr){ +; CHECK-SD-LABEL: load_v3i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr d0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v3i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[3], v0.h[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %a = load <3 x i16>, ptr %ptr + ret <3 x i16> %a +} + +define <7 x i16> @load_v7i16(ptr %ptr){ +; CHECK-SD-LABEL: load_v7i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v7i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr h0, [x0] +; CHECK-GI-NEXT: ldr h1, [x0, #2] +; CHECK-GI-NEXT: mov v0.h[1], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #4] +; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #6] +; CHECK-GI-NEXT: mov v0.h[3], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #8] +; CHECK-GI-NEXT: mov v0.h[4], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #10] +; CHECK-GI-NEXT: mov v0.h[5], v1.h[0] +; CHECK-GI-NEXT: ldr h1, [x0, #12] +; CHECK-GI-NEXT: mov v0.h[6], v1.h[0] +; CHECK-GI-NEXT: mov v0.h[7], v0.h[0] +; CHECK-GI-NEXT: ret + %a = load <7 x i16>, ptr %ptr + ret <7 x i16> %a +} + +define <3 x i32> @load_v3i32(ptr %ptr){ +; CHECK-SD-LABEL: load_v3i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v3i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldp s0, s1, [x0] +; 
CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: ldr s1, [x0, #8] +; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] +; CHECK-GI-NEXT: mov v0.s[3], v0.s[0] +; CHECK-GI-NEXT: ret + %a = load <3 x i32>, ptr %ptr + ret <3 x i32> %a +} From 37daff028fcec27f2be1bb990df77e19c0244ccf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 28 Feb 2024 11:19:48 +0000 Subject: [PATCH 055/114] [X86] setcc-lowering.ll - regenerate with AVX2 test coverage Added while triaging a regression from #82290 --- llvm/test/CodeGen/X86/setcc-lowering.ll | 97 +++++++++++++++---------- 1 file changed, 60 insertions(+), 37 deletions(-) diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll index 705e48ca4c9c9bb..6636d6948b1096e 100644 --- a/llvm/test/CodeGen/X86/setcc-lowering.ll +++ b/llvm/test/CodeGen/X86/setcc-lowering.ll @@ -1,22 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX -; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32 +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefixes=AVX,KNL-32 ; Verify that we don't crash during codegen due to a wrong lowering ; of a setcc node with illegal operand types and return type. define <8 x i16> @pr25080(<8 x i32> %a) { -; AVX-LABEL: pr25080: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX1-LABEL: pr25080: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: pr25080: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; KNL-32-LABEL: pr25080: ; KNL-32: # %bb.0: # %entry @@ -38,23 +51,40 @@ entry: } define void @pr26232(i64 %a, <16 x i1> %b) { -; AVX-LABEL: pr26232: -; AVX: # %bb.0: # %allocas -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB1_1: # %for_loop599 -; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000 -; AVX-NEXT: setl %al -; AVX-NEXT: vmovd %eax, %xmm2 -; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2 -; AVX-NEXT: vpsllw $7, %xmm2, %xmm2 -; AVX-NEXT: vpmovmskb %xmm2, %eax -; AVX-NEXT: testl %eax, %eax -; AVX-NEXT: jne .LBB1_1 -; AVX-NEXT: # %bb.2: # %for_exit600 -; 
AVX-NEXT: retq +; AVX1-LABEL: pr26232: +; AVX1: # %bb.0: # %allocas +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: .p2align 4, 0x90 +; AVX1-NEXT: .LBB1_1: # %for_loop599 +; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX1-NEXT: cmpq $65536, %rdi # imm = 0x10000 +; AVX1-NEXT: setl %al +; AVX1-NEXT: vmovd %eax, %xmm2 +; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 +; AVX1-NEXT: vpmovmskb %xmm2, %eax +; AVX1-NEXT: testl %eax, %eax +; AVX1-NEXT: jne .LBB1_1 +; AVX1-NEXT: # %bb.2: # %for_exit600 +; AVX1-NEXT: retq +; +; AVX2-LABEL: pr26232: +; AVX2: # %bb.0: # %allocas +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB1_1: # %for_loop599 +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: cmpq $65536, %rdi # imm = 0x10000 +; AVX2-NEXT: setl %al +; AVX2-NEXT: vmovd %eax, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 +; AVX2-NEXT: vpmovmskb %xmm1, %eax +; AVX2-NEXT: testl %eax, %eax +; AVX2-NEXT: jne .LBB1_1 +; AVX2-NEXT: # %bb.2: # %for_exit600 +; AVX2-NEXT: retq ; ; KNL-32-LABEL: pr26232: ; KNL-32: # %bb.0: # %allocas @@ -108,14 +138,7 @@ define <4 x i32> @pcmpgt(<4 x i8> %x) { ; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; KNL-32-LABEL: pcmpgt: -; KNL-32: # %bb.0: -; KNL-32-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; KNL-32-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-32-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; KNL-32-NEXT: retl +; AVX-NEXT: ret{{[l|q]}} %zext = zext <4 x i8> %x to <4 x i32> %icmp = icmp ne <4 x i32> %zext, zeroinitializer %sext = sext <4 x i1> %icmp to <4 x i32> From ffe7049b543adb9739261d28a60d4a47a00aa2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20=C3=81lvarez=20Ayll=C3=B3n?= Date: Wed, 28 Feb 2024 12:22:57 +0100 Subject: [PATCH 056/114] [clang][analyzer] StreamChecker: Model getc, vfscanf, putc, vfprintf (#82476) Model `getc` and `putc` as equivalent to `fgetc` and `fputc` respectively. Model `vfscanf` and `vfprintf` as `fscanf` and `fprintf`, except that `vfscanf` can not invalidate the parameters due to the indirection via a `va_list`. Nevertheless, we can still track EOF and errors as for `fscanf`. 
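
As a quick illustration (mine, not part of the patch; the helper names
are made up), this is the kind of code the checker can now reason about:

  #include <cstdio>
  #include <cstdarg>

  // getc/putc are modeled like fgetc/fputc, so the possibly-NULL stream
  // returned by tmpfile() is diagnosed at the getc call.
  void copy_one_char() {
    FILE *f = tmpfile(); // may return NULL
    int c = getc(f);     // warning: stream pointer might be NULL
    if (c != EOF)
      putc(c, f);
    if (f)
      fclose(f);
  }

  // vfprintf is modeled like fprintf; vfscanf is modeled like fscanf,
  // except that it cannot invalidate out-arguments hidden behind va_list.
  void log_line(FILE *f, const char *fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(f, fmt, ap); // checked like fprintf
    va_end(ap);
  }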
--- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 23 ++++++++-- ...ystem-header-simulator-for-simple-stream.h | 2 +- .../system-header-simulator-for-valist.h | 6 +++ .../Analysis/Inputs/system-header-simulator.h | 3 ++ clang/test/Analysis/stream-invalidate.c | 42 +++++++++++++++++++ clang/test/Analysis/stream.c | 34 +++++++++++++++ 6 files changed, 105 insertions(+), 5 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 65bdc4cac309409..29956fed2b3c24b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -348,18 +348,30 @@ class StreamChecker : public Checker EscArgs; - for (auto EscArg : llvm::seq(2u, Call.getNumArgs())) - EscArgs.push_back(EscArg); - StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs); + if (auto const *Callee = Call.getCalleeIdentifier(); + !Callee || !Callee->getName().equals("vfscanf")) { + SmallVector EscArgs; + for (auto EscArg : llvm::seq(2u, Call.getNumArgs())) + EscArgs.push_back(EscArg); + StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs); + } if (StateNotFailed) C.addTransition(StateNotFailed); diff --git a/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h b/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h index 098a2208fecbe91..c26d3582149120e 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h @@ -5,7 +5,7 @@ // suppressed. #pragma clang system_header -typedef struct __sFILE { +typedef struct _FILE { unsigned char *_p; } FILE; FILE *fopen(const char *restrict, const char *restrict) __asm("_" "fopen" ); diff --git a/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h b/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h index 7299b61353d460d..720944abb8ad47c 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h @@ -10,6 +10,8 @@ #define restrict /*restrict*/ #endif +typedef struct _FILE FILE; + typedef __builtin_va_list va_list; #define va_start(ap, param) __builtin_va_start(ap, param) @@ -21,6 +23,10 @@ int vprintf (const char *restrict format, va_list arg); int vsprintf (char *restrict s, const char *restrict format, va_list arg); +int vfprintf(FILE *stream, const char *format, va_list ap); + +int vfscanf(FILE *stream, const char *format, va_list ap); + int some_library_function(int n, va_list arg); // No warning from system header. 
diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h index 15986984802c0e6..8fd51449ecc0a42 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator.h +++ b/clang/test/Analysis/Inputs/system-header-simulator.h @@ -73,6 +73,9 @@ int ferror(FILE *stream); int fileno(FILE *stream); int fflush(FILE *stream); + +int getc(FILE *stream); + size_t strlen(const char *); char *strcpy(char *restrict, const char *restrict); diff --git a/clang/test/Analysis/stream-invalidate.c b/clang/test/Analysis/stream-invalidate.c index 6745d11a2fe7017..5046a356d0583d5 100644 --- a/clang/test/Analysis/stream-invalidate.c +++ b/clang/test/Analysis/stream-invalidate.c @@ -4,6 +4,7 @@ // RUN: -analyzer-checker=debug.ExprInspection #include "Inputs/system-header-simulator.h" +#include "Inputs/system-header-simulator-for-valist.h" void clang_analyzer_eval(int); void clang_analyzer_dump(int); @@ -145,3 +146,44 @@ void test_fgetpos() { fclose(F); } + +void test_fprintf() { + FILE *F1 = tmpfile(); + if (!F1) + return; + + unsigned a = 42; + char *output = "HELLO"; + int r = fprintf(F1, "%s\t%u\n", output, a); + // fprintf does not invalidate any of its input + // 69 is ascii for 'E' + clang_analyzer_dump(a); // expected-warning {{42 S32b}} + clang_analyzer_dump(output[1]); // expected-warning {{69 S32b}} + fclose(F1); +} + +int test_vfscanf_inner(const char *fmt, ...) { + FILE *F1 = tmpfile(); + if (!F1) + return EOF; + + va_list ap; + va_start(ap, fmt); + + int r = vfscanf(F1, fmt, ap); + + fclose(F1); + va_end(ap); + return r; +} + +void test_vfscanf() { + int i = 42; + int j = 43; + int r = test_vfscanf_inner("%d", &i); + if (r != EOF) { + // i gets invalidated by the call to test_vfscanf_inner, not by vfscanf. 
+ clang_analyzer_dump(i); // expected-warning {{conj_$}} + clang_analyzer_dump(j); // expected-warning {{43 S32b}} + } +} diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 378c9154f8f6a87..7c7f68abeecac79 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -1,6 +1,7 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify %s #include "Inputs/system-header-simulator.h" +#include "Inputs/system-header-simulator-for-valist.h" void clang_analyzer_eval(int); @@ -65,12 +66,24 @@ void check_fseek(void) { fclose(fp); } +void check_fseeko(void) { + FILE *fp = tmpfile(); + fseeko(fp, 0, 0); // expected-warning {{Stream pointer might be NULL}} + fclose(fp); +} + void check_ftell(void) { FILE *fp = tmpfile(); ftell(fp); // expected-warning {{Stream pointer might be NULL}} fclose(fp); } +void check_ftello(void) { + FILE *fp = tmpfile(); + ftello(fp); // expected-warning {{Stream pointer might be NULL}} + fclose(fp); +} + void check_rewind(void) { FILE *fp = tmpfile(); rewind(fp); // expected-warning {{Stream pointer might be NULL}} @@ -129,6 +142,18 @@ void f_dopen(int fd) { fclose(F); } +void f_vfprintf(int fd, va_list args) { + FILE *F = fdopen(fd, "r"); + vfprintf(F, "%d", args); // expected-warning {{Stream pointer might be NULL}} + fclose(F); +} + +void f_vfscanf(int fd, va_list args) { + FILE *F = fdopen(fd, "r"); + vfscanf(F, "%u", args); // expected-warning {{Stream pointer might be NULL}} + fclose(F); +} + void f_seek(void) { FILE *p = fopen("foo", "r"); if (!p) @@ -138,6 +163,15 @@ void f_seek(void) { fclose(p); } +void f_seeko(void) { + FILE *p = fopen("foo", "r"); + if (!p) + return; + fseeko(p, 1, SEEK_SET); // no-warning + fseeko(p, 1, 3); // expected-warning {{The whence argument to fseek() should be SEEK_SET, SEEK_END, or SEEK_CUR}} + fclose(p); +} + void f_double_close(void) { FILE *p = fopen("foo", "r"); if (!p) From fe97a59a7be51dfc7e92619536963051604d155a Mon Sep 17 00:00:00 2001 From: Julian Schmidt Date: Wed, 28 Feb 2024 12:43:43 +0100 Subject: [PATCH 057/114] [clang] remove (clang::)ast_matchers:: namespace from AST matcher args for docs (#81437) When parsing the ASTMatchers.h file, a matcher could specify an argument that is a matcher using the not needed namespace `(clang::)ast_matchers::`. Change the argument parsing in dump_ast_matchers.py to remove those namespaces such that when parameters with these namespaces slip through, the namespaces will be not be shown in the matchers reference, like it is done with the `internal` namespace. Additionally, remove the not needed namespaces from arguments in ASTMatchers.h. --- clang/docs/LibASTMatchersReference.html | 14 +++++++------- clang/docs/tools/dump_ast_matchers.py | 2 ++ clang/include/clang/ASTMatchers/ASTMatchers.h | 16 +++++++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index c40d679e383bb2a..8a06084955aa6b8 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -7049,7 +7049,7 @@

AST Traversal Matchers

-Matcher<CXXFoldExpr>hasFoldInitast_matchers::Matcher<Expr> InnerMacher +Matcher<CXXFoldExpr>hasFoldInitMatcher<Expr> InnerMacher
Matches the operand that does not contain the parameter pack.
 
 Example matches `(0 + ... + args)` and `(args * ... * 1)`
@@ -7089,7 +7089,7 @@ 

AST Traversal Matchers

-Matcher<CXXFoldExpr>hasPatternast_matchers::Matcher<Expr> InnerMacher +Matcher<CXXFoldExpr>hasPatternMatcher<Expr> InnerMacher
Matches the operand that contains the parameter pack.
 
 Example matches `(0 + ... + args)`
@@ -7859,7 +7859,7 @@ 

AST Traversal Matchers

-Matcher<ClassTemplateSpecializationDecl>forEachTemplateArgumentclang::ast_matchers::Matcher<TemplateArgument> InnerMatcher +Matcher<ClassTemplateSpecializationDecl>forEachTemplateArgumentMatcher<TemplateArgument> InnerMatcher
Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
@@ -8454,7 +8454,7 @@ 

AST Traversal Matchers

-Matcher<Expr>ignoringElidableConstructorCallast_matchers::Matcher<Expr> InnerMatcher +Matcher<Expr>ignoringElidableConstructorCallMatcher<Expr> InnerMatcher
Matches expressions that match InnerMatcher that are possibly wrapped in an
 elidable constructor and other corresponding bookkeeping nodes.
 
@@ -8691,7 +8691,7 @@ 

AST Traversal Matchers

-Matcher<FunctionDecl>forEachTemplateArgumentclang::ast_matchers::Matcher<TemplateArgument> InnerMatcher +Matcher<FunctionDecl>forEachTemplateArgumentMatcher<TemplateArgument> InnerMatcher
Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
@@ -8959,7 +8959,7 @@ 

AST Traversal Matchers

-Matcher<InitListExpr>hasInitunsigned N, ast_matchers::Matcher<Expr> InnerMatcher +Matcher<InitListExpr>hasInitunsigned N, Matcher<Expr> InnerMatcher
Matches the n'th item of an initializer list expression.
 
 Example matches y.
@@ -10026,7 +10026,7 @@ 

AST Traversal Matchers

-Matcher<TemplateSpecializationType>forEachTemplateArgumentclang::ast_matchers::Matcher<TemplateArgument> InnerMatcher +Matcher<TemplateSpecializationType>forEachTemplateArgumentMatcher<TemplateArgument> InnerMatcher
Matches classTemplateSpecialization, templateSpecializationType and
 functionDecl nodes where the template argument matches the inner matcher.
 This matcher may produce multiple matches.
diff --git a/clang/docs/tools/dump_ast_matchers.py b/clang/docs/tools/dump_ast_matchers.py
index cc7024d1627b976..705ff0d4d40985c 100755
--- a/clang/docs/tools/dump_ast_matchers.py
+++ b/clang/docs/tools/dump_ast_matchers.py
@@ -116,6 +116,8 @@ def strip_doxygen(comment):
 
 def unify_arguments(args):
     """Gets rid of anything the user doesn't care about in the argument list."""
+    args = re.sub(r"clang::ast_matchers::internal::", r"", args)
+    args = re.sub(r"ast_matchers::internal::", r"", args)
     args = re.sub(r"internal::", r"", args)
     args = re.sub(r"extern const\s+(.*)&", r"\1 ", args)
     args = re.sub(r"&", r" ", args)
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h
index dc1f49525a004a0..ced89ff127ab3ec 100644
--- a/clang/include/clang/ASTMatchers/ASTMatchers.h
+++ b/clang/include/clang/ASTMatchers/ASTMatchers.h
@@ -4580,8 +4580,7 @@ AST_POLYMORPHIC_MATCHER_P2(hasArgument,
 ///       return (args * ... * 1);
 ///   }
 /// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher,
-              InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasFoldInit, internal::Matcher, InnerMacher) {
   const auto *const Init = Node.getInit();
   return Init && InnerMacher.matches(*Init, Finder, Builder);
 }
@@ -4603,8 +4602,7 @@ AST_MATCHER_P(CXXFoldExpr, hasFoldInit, ast_matchers::internal::Matcher,
 ///       return (args * ... * 1);
 ///   }
 /// \endcode
-AST_MATCHER_P(CXXFoldExpr, hasPattern, ast_matchers::internal::Matcher,
-              InnerMacher) {
+AST_MATCHER_P(CXXFoldExpr, hasPattern, internal::Matcher, InnerMacher) {
   const Expr *const Pattern = Node.getPattern();
   return Pattern && InnerMacher.matches(*Pattern, Finder, Builder);
 }
@@ -4685,8 +4683,8 @@ AST_MATCHER(CXXFoldExpr, isBinaryFold) { return Node.getInit() != nullptr; }
 /// \code
 ///   int x{y}.
 /// \endcode
-AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N,
-               ast_matchers::internal::Matcher, InnerMatcher) {
+AST_MATCHER_P2(InitListExpr, hasInit, unsigned, N, internal::Matcher,
+               InnerMatcher) {
   return N < Node.getNumInits() &&
           InnerMatcher.matches(*Node.getInit(N), Finder, Builder);
 }
@@ -5309,7 +5307,7 @@ AST_POLYMORPHIC_MATCHER_P(
     forEachTemplateArgument,
     AST_POLYMORPHIC_SUPPORTED_TYPES(ClassTemplateSpecializationDecl,
                                     TemplateSpecializationType, FunctionDecl),
-    clang::ast_matchers::internal::Matcher, InnerMatcher) {
+    internal::Matcher, InnerMatcher) {
   ArrayRef TemplateArgs =
       clang::ast_matchers::internal::getTemplateSpecializationArgs(Node);
   clang::ast_matchers::internal::BoundNodesTreeBuilder Result;
@@ -8525,8 +8523,8 @@ AST_MATCHER(FunctionDecl, hasTrailingReturn) {
 ///
 /// ``varDecl(hasInitializer(ignoringElidableConstructorCall(callExpr())))``
 /// matches ``H D = G()`` in C++11 through C++17 (and beyond).
-AST_MATCHER_P(Expr, ignoringElidableConstructorCall,
-              ast_matchers::internal::Matcher, InnerMatcher) {
+AST_MATCHER_P(Expr, ignoringElidableConstructorCall, internal::Matcher,
+              InnerMatcher) {
   // E tracks the node that we are examining.
   const Expr *E = &Node;
   // If present, remove an outer `ExprWithCleanups` corresponding to the

From 3d454d2895820cd1e6caa92f1e82ba468b5b5f09 Mon Sep 17 00:00:00 2001
From: Paul Walker 
Date: Wed, 28 Feb 2024 11:48:53 +0000
Subject: [PATCH 058/114] [LLVM][TypeSize] Remove default constructor. (#82810)
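
Callers that previously default-constructed a TypeSize and assigned it
later must now initialize it up front. A minimal before/after sketch
(my illustration, using the named constructors exercised by the unit
test below):

  #include "llvm/Support/TypeSize.h"

  void example() {
    // llvm::TypeSize S;  // no longer compiles: no default constructor
    llvm::TypeSize Fixed = llvm::TypeSize::getFixed(32);      // 32 fixed bits
    llvm::TypeSize Scalable = llvm::TypeSize::getScalable(4); // 4 x vscale units
    (void)Fixed;
    (void)Scalable;
  }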

---
 clang/lib/CodeGen/CGCall.cpp            | 15 ++++++---------
 llvm/include/llvm/Support/TypeSize.h    |  2 --
 llvm/unittests/Support/TypeSizeTest.cpp |  1 -
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index d05cf1c6e1814eb..0d86fcf544d0fdd 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3221,12 +3221,10 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
 
       llvm::StructType *STy =
           dyn_cast(ArgI.getCoerceToType());
-      llvm::TypeSize StructSize;
-      llvm::TypeSize PtrElementSize;
       if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
           STy->getNumElements() > 1) {
-        StructSize = CGM.getDataLayout().getTypeAllocSize(STy);
-        PtrElementSize =
+        llvm::TypeSize StructSize = CGM.getDataLayout().getTypeAllocSize(STy);
+        llvm::TypeSize PtrElementSize =
             CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty));
         if (STy->containsHomogeneousScalableVectorTypes()) {
           assert(StructSize == PtrElementSize &&
@@ -5310,12 +5308,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
       llvm::StructType *STy =
           dyn_cast(ArgInfo.getCoerceToType());
-      llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
-      llvm::TypeSize SrcTypeSize;
-      llvm::TypeSize DstTypeSize;
       if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) {
-        SrcTypeSize = CGM.getDataLayout().getTypeAllocSize(SrcTy);
-        DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+        llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
+        llvm::TypeSize SrcTypeSize =
+            CGM.getDataLayout().getTypeAllocSize(SrcTy);
+        llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
         if (STy->containsHomogeneousScalableVectorTypes()) {
           assert(SrcTypeSize == DstTypeSize &&
                  "Only allow non-fractional movement of structure with "
diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h
index 1b793b0eccf3c7b..68dbe1ea3062abc 100644
--- a/llvm/include/llvm/Support/TypeSize.h
+++ b/llvm/include/llvm/Support/TypeSize.h
@@ -321,8 +321,6 @@ class TypeSize : public details::FixedOrScalableQuantity {
       : FixedOrScalableQuantity(V) {}
 
 public:
-  constexpr TypeSize() : FixedOrScalableQuantity(0, false) {}
-
   constexpr TypeSize(ScalarTy Quantity, bool Scalable)
       : FixedOrScalableQuantity(Quantity, Scalable) {}
 
diff --git a/llvm/unittests/Support/TypeSizeTest.cpp b/llvm/unittests/Support/TypeSizeTest.cpp
index 34fe376989e7ba7..b02b7e600953593 100644
--- a/llvm/unittests/Support/TypeSizeTest.cpp
+++ b/llvm/unittests/Support/TypeSizeTest.cpp
@@ -81,7 +81,6 @@ static_assert(INT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
 static_assert(UINT64_C(2) * TSFixed32 == TypeSize::getFixed(64));
 static_assert(alignTo(TypeSize::getFixed(7), 8) == TypeSize::getFixed(8));
 
-static_assert(TypeSize() == TypeSize::getFixed(0));
 static_assert(TypeSize::getZero() == TypeSize::getFixed(0));
 static_assert(TypeSize::getZero() != TypeSize::getScalable(0));
 static_assert(TypeSize::getFixed(0) != TypeSize::getScalable(0));

From 2703f7ec4fb603a7c38288409c9ac919a47cca2a Mon Sep 17 00:00:00 2001
From: Kareem Ergawy 
Date: Wed, 28 Feb 2024 13:02:23 +0100
Subject: [PATCH 059/114] [flang] Fix linker error for debug builds. (#83250)

PR #81833 introduced some changes that broke some debug builds. This
happened due to an indirectly included file referencing an `operator <<`
function which is defined in a `.cpp` file that is not linked with `tco`
and `fir-opt`.
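
The fix is the usual forward-declaration pattern, sketched here with the
names from the diff (namespace nesting simplified; my illustration):

  // AbstractConverter.h: forward-declare instead of including the heavy
  // header, so users of this header stop referencing symbols whose
  // definitions live in a library that tco/fir-opt do not link.
  namespace lower {
  class SymMap;
  class SymbolBox; // replaces #include "flang/Lower/SymbolMap.h"
  } // namespace lower

  // DataSharingProcessor.cpp: include the full definition only where it
  // is actually used.
  #include "flang/Lower/SymbolMap.h"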
---
 flang/include/flang/Lower/AbstractConverter.h   | 2 +-
 flang/lib/Lower/OpenMP/DataSharingProcessor.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index fa3fd9a8cc05eef..944430a15488391 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -16,7 +16,6 @@
 #include "flang/Common/Fortran.h"
 #include "flang/Lower/LoweringOptions.h"
 #include "flang/Lower/PFTDefs.h"
-#include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Semantics/symbol.h"
 #include "mlir/IR/Builders.h"
@@ -54,6 +53,7 @@ class DerivedTypeSpec;
 
 namespace lower {
 class SymMap;
+class SymbolBox;
 namespace pft {
 struct Variable;
 }
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
index d6287cb4bc239eb..717b8cc0276a304 100644
--- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -14,6 +14,7 @@
 
 #include "Utils.h"
 #include "flang/Lower/PFTBuilder.h"
+#include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Semantics/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"

From 6287b7b9e9b729afde4aef7d41609d9b4efaecda Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Wed, 28 Feb 2024 12:24:34 +0000
Subject: [PATCH 060/114] [X86] combineEXTRACT_SUBVECTOR - extract 256-bit
 comparisons if only one subvector is required

If only one subvector extraction will be necessary (i.e. because the other is constant etc.), then extract the source operands and perform the operation as a 128-bit comparison

Ideally DAGCombiner's narrowExtractedVectorBinOp would handle this, but it's tricky to confirm when a target opcode can be safely extracted and performed as a different vector type

Partially improves an outstanding regression in #82290
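
At the value level the fold relies on extraction distributing over the
compare. A runnable illustration with AVX2 intrinsics (my example, not
from the patch; build with -mavx2):

  #include <immintrin.h>
  #include <cstdio>
  #include <cstring>

  int main() {
    alignas(32) char buf[32];
    for (int i = 0; i < 32; ++i)
      buf[i] = (i % 3 == 0) ? 0 : (char)i;
    __m256i x = _mm256_load_si256(reinterpret_cast<const __m256i *>(buf));

    // 256-bit compare against a "free" operand (zero), then extract...
    __m128i hi_of_cmp = _mm256_extracti128_si256(
        _mm256_cmpeq_epi8(x, _mm256_setzero_si256()), 1);
    // ...equals extracting first and comparing at 128 bits.
    __m128i cmp_of_hi = _mm_cmpeq_epi8(_mm256_extracti128_si256(x, 1),
                                       _mm_setzero_si128());

    char a[16], b[16];
    _mm_storeu_si128(reinterpret_cast<__m128i *>(a), hi_of_cmp);
    _mm_storeu_si128(reinterpret_cast<__m128i *>(b), cmp_of_hi);
    puts(memcmp(a, b, 16) == 0 ? "equal" : "different");
    return 0;
  }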
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 26 +++++++++++++++++++++++--
 llvm/test/CodeGen/X86/kshift.ll         | 12 +++++-------
 llvm/test/CodeGen/X86/pr46455.ll        |  5 ++---
 llvm/test/CodeGen/X86/setcc-lowering.ll |  8 ++++----
 4 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0722c402348ee0d..bec13d1c00ef7df 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55789,6 +55789,15 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  auto IsExtractFree = [](SDValue V) {
+    V = peekThroughBitcasts(V);
+    if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+      return true;
+    if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
+      return true;
+    return V.isUndef();
+  };
+
   // If we're extracting the lowest subvector and we're the only user,
   // we may be able to perform this with a smaller vector width.
   unsigned InOpcode = InVec.getOpcode();
@@ -55830,14 +55839,27 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
     }
     if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
-        (VT.is128BitVector() || VT.is256BitVector())) {
+        (SizeInBits == 128 || SizeInBits == 256)) {
       SDValue InVecSrc = InVec.getOperand(0);
       unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
       SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
       return DAG.getNode(InOpcode, DL, VT, Ext);
     }
+    if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
+         InOpcode == X86ISD::PCMPGT) &&
+        (IsExtractFree(InVec.getOperand(0)) ||
+         IsExtractFree(InVec.getOperand(1))) &&
+        SizeInBits == 128) {
+      SDValue Ext0 =
+          extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
+      SDValue Ext1 =
+          extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
+      if (InOpcode == X86ISD::CMPP)
+        return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
+      return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
+    }
     if (InOpcode == X86ISD::MOVDDUP &&
-        (VT.is128BitVector() || VT.is256BitVector())) {
+        (SizeInBits == 128 || SizeInBits == 256)) {
       SDValue Ext0 =
           extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
       return DAG.getNode(InOpcode, DL, VT, Ext0);
diff --git a/llvm/test/CodeGen/X86/kshift.ll b/llvm/test/CodeGen/X86/kshift.ll
index 0acf82f5a144a2e..f4efacc1946cff9 100644
--- a/llvm/test/CodeGen/X86/kshift.ll
+++ b/llvm/test/CodeGen/X86/kshift.ll
@@ -270,11 +270,10 @@ define i64 @kshiftl_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
 ; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
-; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -564,14 +563,13 @@ define i64 @kshiftr_v64i1_63(<64 x i8> %x, <64 x i8> %y) {
 ; KNL-LABEL: kshiftr_v64i1_63:
 ; KNL:       # %bb.0:
 ; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k1
-; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; KNL-NEXT:    vpcmpeqb %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
 ; KNL-NEXT:    kmovw %k0, %eax
diff --git a/llvm/test/CodeGen/X86/pr46455.ll b/llvm/test/CodeGen/X86/pr46455.ll
index 092e417c812e78a..3d1b04f74357139 100644
--- a/llvm/test/CodeGen/X86/pr46455.ll
+++ b/llvm/test/CodeGen/X86/pr46455.ll
@@ -4,10 +4,10 @@
 define void @EntryModule(ptr %buffer_table) {
 ; CHECK-LABEL: EntryModule:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movq 24(%rdi), %rcx
-; CHECK-NEXT:    vcmpneqps (%rax), %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vcmpneqps (%rax), %xmm0, %xmm0
 ; CHECK-NEXT:    vpsrld $31, %xmm0, %xmm1
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
 ; CHECK-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
@@ -16,7 +16,6 @@ define void @EntryModule(ptr %buffer_table) {
 ; CHECK-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovd %xmm0, (%rcx)
-; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
 entry:
   %i1 = load ptr, ptr %buffer_table, align 8
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index 6636d6948b1096e..90e5c279d2e17c3 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -21,11 +21,11 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
 ;
 ; AVX2-LABEL: pr25080:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [8388607,8388607,8388607,8388607]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper

From 0a54b36d5e6d941e25c60520a7317d75355aeae5 Mon Sep 17 00:00:00 2001
From: AtariDreams <83477269+AtariDreams@users.noreply.github.com>
Date: Wed, 28 Feb 2024 07:32:58 -0500
Subject: [PATCH 061/114] [X86] Resolve FIXME: Create cld only when needed
 (#82415)

Only emit cld when the interrupt handler also uses rep instructions, calls another function, or contains inline asm.
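
For illustration (hypothetical handlers, not part of the patch):

```c
// x86 interrupt handlers; build e.g. with: clang -mgeneral-regs-only -c isr.c
struct interrupt_frame;

void helper(void);

// Calls another function, so the prologue still emits "cld": the callee
// may depend on a cleared direction flag.
__attribute__((interrupt)) void isr_with_call(struct interrupt_frame *frame) {
  helper();
}

// Leaf handler with no calls, no rep instructions, and no inline asm:
// with this change, no "cld" is emitted in its prologue.
__attribute__((interrupt)) void isr_leaf(struct interrupt_frame *frame) {
}
```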
---
 llvm/lib/Target/X86/X86FrameLowering.cpp     | 67 ++++++++++++++++++--
 llvm/test/CodeGen/X86/x86-32-intrcc.ll       | 10 ---
 llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll |  8 ---
 llvm/test/CodeGen/X86/x86-64-intrcc.ll       |  3 -
 4 files changed, 63 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index be416fb0db0695c..d914e1b61ab075d 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1418,6 +1418,34 @@ bool X86FrameLowering::needsDwarfCFI(const MachineFunction &MF) const {
   return !isWin64Prologue(MF) && MF.needsFrameMoves();
 }
 
+/// Return true if an opcode is part of the REP group of instructions
+static bool isOpcodeRep(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::REPNE_PREFIX:
+  case X86::REP_MOVSB_32:
+  case X86::REP_MOVSB_64:
+  case X86::REP_MOVSD_32:
+  case X86::REP_MOVSD_64:
+  case X86::REP_MOVSQ_32:
+  case X86::REP_MOVSQ_64:
+  case X86::REP_MOVSW_32:
+  case X86::REP_MOVSW_64:
+  case X86::REP_PREFIX:
+  case X86::REP_STOSB_32:
+  case X86::REP_STOSB_64:
+  case X86::REP_STOSD_32:
+  case X86::REP_STOSD_64:
+  case X86::REP_STOSQ_32:
+  case X86::REP_STOSQ_64:
+  case X86::REP_STOSW_32:
+  case X86::REP_STOSW_64:
+    return true;
+  default:
+    break;
+  }
+  return false;
+}
+
 /// emitPrologue - Push callee-saved registers onto the stack, which
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
@@ -2194,13 +2222,44 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // flag (DF in EFLAGS register). Clear this flag by creating "cld" instruction
   // in each prologue of interrupt handler function.
   //
-  // FIXME: Create "cld" instruction only in these cases:
+  // Create "cld" instruction only in these cases:
   // 1. The interrupt handling function uses any of the "rep" instructions.
   // 2. Interrupt handling function calls another function.
+  // 3. The function contains inline asm blocks, as we do not know what they do
   //
-  if (Fn.getCallingConv() == CallingConv::X86_INTR)
-    BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
-        .setMIFlag(MachineInstr::FrameSetup);
+  // TODO: We should also emit cld if we detect the use of std, but as of now,
+  // the compiler does not even emit that instruction or even define it, so in
+  // practice, this would only happen with inline asm, which we cover anyway.
+  if (Fn.getCallingConv() == CallingConv::X86_INTR) {
+    bool NeedsCLD = false;
+
+    for (const MachineBasicBlock &B : MF) {
+      for (const MachineInstr &MI : B) {
+        if (MI.isCall()) {
+          NeedsCLD = true;
+          break;
+        }
+
+        if (isOpcodeRep(MI.getOpcode())) {
+          NeedsCLD = true;
+          break;
+        }
+
+        if (MI.isInlineAsm()) {
+          // TODO: Parse asm for rep instructions or call sites?
+          // For now, let's play it safe and emit a cld instruction
+          // just in case.
+          NeedsCLD = true;
+          break;
+        }
+      }
+    }
+
+    if (NeedsCLD) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
+  }
 
   // At this point we know if the function has WinCFI or not.
   MF.setHasWinCFI(HasWinCFI);
diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
index 2e482753e26853e..3c3944c2082bd58 100644
--- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
@@ -149,7 +149,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
 ; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    fldt f80
 ; CHECK-NEXT:    fld1
 ; CHECK-NEXT:    faddp %st, %st(1)
@@ -163,7 +162,6 @@ define x86_intrcc void @test_isr_x87(ptr byval(%struct.interrupt_frame) %frame)
 ; CHECK0-NEXT:    pushl %ebp
 ; CHECK0-NEXT:    movl %esp, %ebp
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    fldt f80
 ; CHECK0-NEXT:    fld1
 ; CHECK0-NEXT:    faddp %st, %st(1)
@@ -188,7 +186,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    leal 20(%ebp), %eax
 ; CHECK-NEXT:    leal 4(%ebp), %ecx
 ; CHECK-NEXT:    movl %ecx, sink_address
@@ -206,7 +203,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
 ; CHECK0-NEXT:    pushl %ecx
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    leal 4(%ebp), %ecx
 ; CHECK0-NEXT:    movl %ecx, %eax
 ; CHECK0-NEXT:    addl $16, %eax
@@ -234,7 +230,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
 ; CHECK-NEXT:    pushl %ecx
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    movl 4(%ebp), %eax
 ; CHECK-NEXT:    leal 24(%ebp), %ecx
 ; CHECK-NEXT:    leal 8(%ebp), %edx
@@ -257,7 +252,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
 ; CHECK0-NEXT:    pushl %ecx
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    leal 8(%ebp), %edx
 ; CHECK0-NEXT:    movl %edx, %ecx
@@ -288,7 +282,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
 ; CHECK-NEXT:    movl %esp, %ebp
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    leal 4(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, sink_address
 ; CHECK-NEXT:    leal -4(%ebp), %esp
@@ -303,7 +296,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
 ; CHECK0-NEXT:    movl %esp, %ebp
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-16, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    leal 4(%ebp), %eax
 ; CHECK0-NEXT:    movl %eax, sink_address
@@ -358,7 +350,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-32, %esp
 ; CHECK-NEXT:    subl $32, %esp
-; CHECK-NEXT:    cld
 ; CHECK-NEXT:    movl 4(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, (%esp)
 ; CHECK-NEXT:    leal -4(%ebp), %esp
@@ -374,7 +365,6 @@ define x86_intrcc void @test_isr_realign(ptr byval(%struct.interrupt_frame) %fra
 ; CHECK0-NEXT:    pushl %eax
 ; CHECK0-NEXT:    andl $-32, %esp
 ; CHECK0-NEXT:    subl $32, %esp
-; CHECK0-NEXT:    cld
 ; CHECK0-NEXT:    movl 4(%ebp), %eax
 ; CHECK0-NEXT:    movl %eax, (%esp)
 ; CHECK0-NEXT:    leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
index a46b9d9ba5a1001..1fe395b84d46c3b 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc-uintr.ll
@@ -21,28 +21,24 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_empty(ptr nocapture byval(%s
 ; CHECK-USER-LABEL: test_uintr_isr_cc_empty:
 ; CHECK-USER:       # %bb.0: # %entry
 ; CHECK-USER-NEXT:    pushq %rax
-; CHECK-USER-NEXT:    cld
 ; CHECK-USER-NEXT:    addq $16, %rsp
 ; CHECK-USER-NEXT:    uiret
 ;
 ; CHECK0-USER-LABEL: test_uintr_isr_cc_empty:
 ; CHECK0-USER:       # %bb.0: # %entry
 ; CHECK0-USER-NEXT:    pushq %rax
-; CHECK0-USER-NEXT:    cld
 ; CHECK0-USER-NEXT:    addq $16, %rsp
 ; CHECK0-USER-NEXT:    uiret
 ;
 ; CHECK-KERNEL-LABEL: test_uintr_isr_cc_empty:
 ; CHECK-KERNEL:       # %bb.0: # %entry
 ; CHECK-KERNEL-NEXT:    pushq %rax
-; CHECK-KERNEL-NEXT:    cld
 ; CHECK-KERNEL-NEXT:    addq $16, %rsp
 ; CHECK-KERNEL-NEXT:    iretq
 ;
 ; CHECK0-KERNEL-LABEL: test_uintr_isr_cc_empty:
 ; CHECK0-KERNEL:       # %bb.0: # %entry
 ; CHECK0-KERNEL-NEXT:    pushq %rax
-; CHECK0-KERNEL-NEXT:    cld
 ; CHECK0-KERNEL-NEXT:    addq $16, %rsp
 ; CHECK0-KERNEL-NEXT:    iretq
 entry:
@@ -75,7 +71,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK-USER-NEXT:    pushq %rax
 ; CHECK-USER-NEXT:    pushq %rdx
 ; CHECK-USER-NEXT:    pushq %rcx
-; CHECK-USER-NEXT:    cld
 ; CHECK-USER-NEXT:    movq 32(%rsp), %rax
 ; CHECK-USER-NEXT:    movq 40(%rsp), %rcx
 ; CHECK-USER-NEXT:    movq 48(%rsp), %rdx
@@ -96,7 +91,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK0-USER-NEXT:    pushq %rax
 ; CHECK0-USER-NEXT:    pushq %rdx
 ; CHECK0-USER-NEXT:    pushq %rcx
-; CHECK0-USER-NEXT:    cld
 ; CHECK0-USER-NEXT:    movq 32(%rsp), %rax
 ; CHECK0-USER-NEXT:    leaq 40(%rsp), %rcx
 ; CHECK0-USER-NEXT:    movq (%rcx), %rdx
@@ -118,7 +112,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK-KERNEL-NEXT:    pushq %rax
 ; CHECK-KERNEL-NEXT:    pushq %rdx
 ; CHECK-KERNEL-NEXT:    pushq %rcx
-; CHECK-KERNEL-NEXT:    cld
 ; CHECK-KERNEL-NEXT:    movq 32(%rsp), %rax
 ; CHECK-KERNEL-NEXT:    movq 40(%rsp), %rcx
 ; CHECK-KERNEL-NEXT:    movq 48(%rsp), %rdx
@@ -139,7 +132,6 @@ define dso_local x86_intrcc void @test_uintr_isr_cc_args(ptr nocapture readonly
 ; CHECK0-KERNEL-NEXT:    pushq %rax
 ; CHECK0-KERNEL-NEXT:    pushq %rdx
 ; CHECK0-KERNEL-NEXT:    pushq %rcx
-; CHECK0-KERNEL-NEXT:    cld
 ; CHECK0-KERNEL-NEXT:    movq 32(%rsp), %rax
 ; CHECK0-KERNEL-NEXT:    leaq 40(%rsp), %rcx
 ; CHECK0-KERNEL-NEXT:    movq (%rcx), %rdx
diff --git a/llvm/test/CodeGen/X86/x86-64-intrcc.ll b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
index 443d4c2fa464904..5fc606eb566ea6b 100644
--- a/llvm/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-64-intrcc.ll
@@ -114,7 +114,6 @@ define dso_local x86_intrcc void @test_fp_1(ptr byval(%struct.interrupt_frame) %
   ; CHECK: # %bb.0: # %entry
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK-DAG: leaq 8(%rbp), %[[R1:[^ ]*]]
   ; CHECK-DAG: leaq 40(%rbp), %[[R2:[^ ]*]]
   ; CHECK: movq %[[R1]], sink_address
@@ -136,7 +135,6 @@ define dso_local x86_intrcc void @test_fp_2(ptr byval(%struct.interrupt_frame) %
   ; CHECK-NEXT: pushq %rax
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK-DAG: movq 16(%rbp), %[[R3:[^ ]*]]
   ; CHECK-DAG: leaq 24(%rbp), %[[R1:[^ ]*]]
   ; CHECK-DAG: leaq 56(%rbp), %[[R2:[^ ]*]]
@@ -164,7 +162,6 @@ define x86_intrcc void @test_copy_elide(ptr byval(%struct.interrupt_frame) %fram
   ; CHECK-NEXT: pushq %rax
   ; CHECK-NEXT: pushq %rbp
   ; CHECK-NEXT: movq %rsp, %rbp
-  ; CHECK: cld
   ; CHECK: leaq 16(%rbp), %[[R1:[^ ]*]]
   ; CHECK: movq %[[R1]], sink_address(%rip)
 entry:

From 07d8a457ad8bb9a14974b9cb47072746c7f5e489 Mon Sep 17 00:00:00 2001
From: Ilia Kuklin 
Date: Wed, 28 Feb 2024 17:38:26 +0500
Subject: [PATCH 062/114] [llvm-objcopy] Add --set-symbol-visibility and
 --set-symbols-visibility options (#80872)

Add options --set-symbol-visibility and --set-symbols-visibility to
manually change the visibility of symbols.

There is already an option to set the visibility of newly added symbols
via --add-symbol and --new-symbol-visibility. The new options allow
changing the visibility of already existing symbols.
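
For example (hypothetical invocations; the option syntax matches the new
test added below):

```
$ llvm-objcopy --set-symbol-visibility=foo=hidden in.o out.o
$ echo 'lib_*' > syms.txt
$ llvm-objcopy --set-symbols-visibility=syms.txt=protected --wildcard in.o out.o
```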
---
 llvm/docs/CommandGuide/llvm-objcopy.rst       |   9 +
 llvm/docs/ReleaseNotes.rst                    |   4 +
 llvm/include/llvm/ObjCopy/ELF/ELFConfig.h     |   3 +
 llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp           |   4 +
 .../ELF/set-symbol-visibility.test            | 311 ++++++++++++++++++
 llvm/tools/llvm-objcopy/ObjcopyOptions.cpp    |  42 +++
 llvm/tools/llvm-objcopy/ObjcopyOpts.td        |  11 +
 7 files changed, 384 insertions(+)
 create mode 100644 llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test

diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 755291676abf6a9..9d0cb7ad1195893 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -455,6 +455,15 @@ them.
 Set the start address of the output to ``<addr>``. Overrides any previously
  specified :option:`--change-start` or :option:`--adjust-start` options.
 
+.. option:: --set-symbol-visibility <symbol>=<visibility>
+
+ Change the visibility of a symbol to the specified value.
+
+.. option:: --set-symbols-visibility <filename>=<visibility>
+
+ Read a list of symbols from <filename> and change their visibility to the
+ specified value. Visibility values: default, internal, hidden, protected.
+
.. option:: --split-dwo <dwo-file>
 
  Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 8a3a0ec66ed874b..51b6527f65bb04d 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -147,6 +147,10 @@ Changes to the LLVM tools
   if it's not specified with the ``--format`` argument and cannot be inferred from
   input files.
 
+* llvm-objcopy now supports ``--set-symbol-visibility`` and
+  ``--set-symbols-visibility`` options for ELF input to change the
+  visibility of symbols.
+
 Changes to LLDB
 ---------------------------------
 
diff --git a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
index d77cb69b159db67..eafed92516c7df3 100644
--- a/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
+++ b/llvm/include/llvm/ObjCopy/ELF/ELFConfig.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_OBJCOPY_ELF_ELFCONFIG_H
 #define LLVM_OBJCOPY_ELF_ELFCONFIG_H
 
+#include "llvm/ObjCopy/CommonConfig.h"
 #include "llvm/Object/ELFTypes.h"
 
 namespace llvm {
@@ -18,6 +19,8 @@ namespace objcopy {
 struct ELFConfig {
   uint8_t NewSymbolVisibility = (uint8_t)ELF::STV_DEFAULT;
 
+  std::vector<std::pair<NameMatcher, uint8_t>> SymbolsToSetVisibility;
+
   // ELF entry point address expression. The input parameter is an entry point
   // address in the input ELF file. The entry address in the output file is
   // calculated with EntryExpr(input_address), when either --set-start or
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index 1b3a58298ec08a6..f52bcb74938d156 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -300,6 +300,10 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config,
          Config.SymbolsToLocalize.matches(Sym.Name)))
       Sym.Binding = STB_LOCAL;
 
+    for (auto &[Matcher, Visibility] : ELFConfig.SymbolsToSetVisibility)
+      if (Matcher.matches(Sym.Name))
+        Sym.Visibility = Visibility;
+
     // Note: these two globalize flags have very similar names but different
     // meanings:
     //
diff --git a/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
new file mode 100644
index 000000000000000..de30ee09bfda53d
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/set-symbol-visibility.test
@@ -0,0 +1,311 @@
+# RUN: yaml2obj --docnum=1 %s -o %t.o
+# RUN: echo '.*' > %t.symbols.regex
+
+## Check that the visibility of all symbols is properly set to DEFAULT.
+# RUN: llvm-objcopy %t.o %t0.o --set-symbols-visibility=%t.symbols.regex=default --regex
+# RUN: llvm-readelf -s %t0.o | FileCheck %s --check-prefix=DEF
+
+# DEF-DAG: DEFAULT     1 default_local
+# DEF-DAG: DEFAULT     1 internal_local
+# DEF-DAG: DEFAULT     1 hidden_local
+# DEF-DAG: DEFAULT     1 protected_local
+# DEF-DAG: DEFAULT     1 default_global
+# DEF-DAG: DEFAULT     1 default_weak
+# DEF-DAG: DEFAULT     1 internal_global
+# DEF-DAG: DEFAULT     1 internal_weak
+# DEF-DAG: DEFAULT     1 hidden_global
+# DEF-DAG: DEFAULT     1 hidden_weak
+# DEF-DAG: DEFAULT     1 protected_global
+# DEF-DAG: DEFAULT     1 protected_weak
+
+## Check that the visibility of all symbols is properly set to HIDDEN.
+# RUN: llvm-objcopy %t.o %t1.o --set-symbols-visibility=%t.symbols.regex=hidden --regex
+# RUN: llvm-readelf -s %t1.o | FileCheck %s --check-prefix=HID
+
+# HID-DAG: HIDDEN      1 default_local
+# HID-DAG: HIDDEN      1 internal_local
+# HID-DAG: HIDDEN      1 hidden_local
+# HID-DAG: HIDDEN      1 protected_local
+# HID-DAG: HIDDEN      1 default_global
+# HID-DAG: HIDDEN      1 default_weak
+# HID-DAG: HIDDEN      1 internal_global
+# HID-DAG: HIDDEN      1 internal_weak
+# HID-DAG: HIDDEN      1 hidden_global
+# HID-DAG: HIDDEN      1 hidden_weak
+# HID-DAG: HIDDEN      1 protected_global
+# HID-DAG: HIDDEN      1 protected_weak
+
+## Check that the visibility of all symbols is properly set to PROTECTED.
+# RUN: llvm-objcopy %t.o %t2.o --set-symbols-visibility=%t.symbols.regex=protected --regex
+# RUN: llvm-readelf -s %t2.o | FileCheck %s --check-prefix=PRO
+
+# PRO-DAG: PROTECTED   1 default_local
+# PRO-DAG: PROTECTED   1 internal_local
+# PRO-DAG: PROTECTED   1 hidden_local
+# PRO-DAG: PROTECTED   1 protected_local
+# PRO-DAG: PROTECTED   1 default_global
+# PRO-DAG: PROTECTED   1 default_weak
+# PRO-DAG: PROTECTED   1 internal_global
+# PRO-DAG: PROTECTED   1 internal_weak
+# PRO-DAG: PROTECTED   1 hidden_global
+# PRO-DAG: PROTECTED   1 hidden_weak
+# PRO-DAG: PROTECTED   1 protected_global
+# PRO-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of all symbols is properly set to INTERNAL.
+# RUN: llvm-objcopy %t.o %t3.o --set-symbols-visibility=%t.symbols.regex=internal --regex
+# RUN: llvm-readelf -s %t3.o | FileCheck %s --check-prefix=INT
+
+# INT-DAG: INTERNAL    1 default_local
+# INT-DAG: INTERNAL    1 internal_local
+# INT-DAG: INTERNAL    1 hidden_local
+# INT-DAG: INTERNAL    1 protected_local
+# INT-DAG: INTERNAL    1 default_global
+# INT-DAG: INTERNAL    1 default_weak
+# INT-DAG: INTERNAL    1 internal_global
+# INT-DAG: INTERNAL    1 internal_weak
+# INT-DAG: INTERNAL    1 hidden_global
+# INT-DAG: INTERNAL    1 hidden_weak
+# INT-DAG: INTERNAL    1 protected_global
+# INT-DAG: INTERNAL    1 protected_weak
+
+## Check that setting the visibility of certain symbols that were read from
+## a file does not affect other symbols.
+# RUN: echo -e "default_local\ninternal_local" > %t.symbol.list
+# RUN: llvm-objcopy %t.o %t4.o --set-symbols-visibility=%t.symbol.list=hidden
+# RUN: llvm-readelf -s %t4.o | FileCheck %s --check-prefix=FILE
+
+# FILE-DAG: HIDDEN      1 default_local
+# FILE-DAG: HIDDEN      1 internal_local
+## Unaffected symbols:
+# FILE-DAG: HIDDEN      1 hidden_local
+# FILE-DAG: PROTECTED   1 protected_local
+# FILE-DAG: DEFAULT     1 default_global
+# FILE-DAG: DEFAULT     1 default_weak
+# FILE-DAG: INTERNAL    1 internal_global
+# FILE-DAG: INTERNAL    1 internal_weak
+# FILE-DAG: HIDDEN      1 hidden_global
+# FILE-DAG: HIDDEN      1 hidden_weak
+# FILE-DAG: PROTECTED   1 protected_global
+# FILE-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of a single symbol is set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t5.o --set-symbol-visibility=default_local=hidden \
+# RUN:                         --set-symbol-visibility=internal_local=protected \
+# RUN:                         --set-symbol-visibility=hidden_local=internal \
+# RUN:                         --set-symbol-visibility=protected_local=default
+# RUN: llvm-readelf -s %t5.o | FileCheck %s --check-prefix=SINGLE
+
+# SINGLE-DAG: HIDDEN      1 default_local
+# SINGLE-DAG: PROTECTED   1 internal_local
+# SINGLE-DAG: INTERNAL    1 hidden_local
+# SINGLE-DAG: DEFAULT     1 protected_local
+## Unaffected symbols:
+# SINGLE-DAG: DEFAULT     1 default_global
+# SINGLE-DAG: DEFAULT     1 default_weak
+# SINGLE-DAG: INTERNAL    1 internal_global
+# SINGLE-DAG: INTERNAL    1 internal_weak
+# SINGLE-DAG: HIDDEN      1 hidden_global
+# SINGLE-DAG: HIDDEN      1 hidden_weak
+# SINGLE-DAG: PROTECTED   1 protected_global
+# SINGLE-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of symbols specified by a regex are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t6.o --set-symbol-visibility='.*'_local=hidden --regex
+# RUN: llvm-readelf -s %t6.o | FileCheck %s --check-prefix=REGEX
+
+# REGEX-DAG: HIDDEN      1 default_local
+# REGEX-DAG: HIDDEN      1 internal_local
+# REGEX-DAG: HIDDEN      1 hidden_local
+# REGEX-DAG: HIDDEN      1 protected_local
+## Unaffected symbols:
+# REGEX-DAG: DEFAULT     1 default_global
+# REGEX-DAG: DEFAULT     1 default_weak
+# REGEX-DAG: INTERNAL    1 internal_global
+# REGEX-DAG: INTERNAL    1 internal_weak
+# REGEX-DAG: HIDDEN      1 hidden_global
+# REGEX-DAG: HIDDEN      1 hidden_weak
+# REGEX-DAG: PROTECTED   1 protected_global
+# REGEX-DAG: PROTECTED   1 protected_weak
+
+## Check that the visibility of symbols specified by a wildcard are set correctly,
+## and that no other symbols are affected.
+# RUN: llvm-objcopy %t.o %t7.o --set-symbol-visibility='*_local'=hidden --wildcard
+# RUN: llvm-readelf -s %t7.o | FileCheck %s --check-prefix=WILDCARD
+
+# WILDCARD-DAG: HIDDEN      1 default_local
+# WILDCARD-DAG: HIDDEN      1 internal_local
+# WILDCARD-DAG: HIDDEN      1 hidden_local
+# WILDCARD-DAG: HIDDEN      1 protected_local
+## Unaffected symbols:
+# WILDCARD-DAG: DEFAULT     1 default_global
+# WILDCARD-DAG: DEFAULT     1 default_weak
+# WILDCARD-DAG: INTERNAL    1 internal_global
+# WILDCARD-DAG: INTERNAL    1 internal_weak
+# WILDCARD-DAG: HIDDEN      1 hidden_global
+# WILDCARD-DAG: HIDDEN      1 hidden_weak
+# WILDCARD-DAG: PROTECTED   1 protected_global
+# WILDCARD-DAG: PROTECTED   1 protected_weak
+
+## Check that the latest option that matches the same symbols as any of the previous
+## options overwrites the visibility of these symbols.
+# RUN: echo -e '*_weak\n*_local' > %t.symbols.pattern
+# RUN: llvm-objcopy %t.o %t8.o --set-symbol-visibility='default_*'=hidden \
+# RUN:                         --set-symbol-visibility='internal_*'=hidden \
+# RUN:                         --set-symbols-visibility=%t.symbols.pattern=protected \
+# RUN:                         --wildcard
+# RUN: llvm-readelf -s %t8.o | FileCheck %s --check-prefix=REWRITE
+
+# REWRITE-DAG: PROTECTED   1 default_local
+# REWRITE-DAG: HIDDEN      1 default_global
+# REWRITE-DAG: PROTECTED   1 default_weak
+# REWRITE-DAG: PROTECTED   1 internal_local
+# REWRITE-DAG: HIDDEN      1 internal_global
+# REWRITE-DAG: PROTECTED   1 internal_weak
+# REWRITE-DAG: PROTECTED   1 hidden_local
+# REWRITE-DAG: PROTECTED   1 hidden_weak
+# REWRITE-DAG: PROTECTED   1 protected_local
+# REWRITE-DAG: PROTECTED   1 protected_weak
+## Unaffected symbols:
+# REWRITE-DAG: HIDDEN      1 hidden_global
+# REWRITE-DAG: PROTECTED   1 protected_global
+
+## Check that a symbol name with a special character is treated as a plain name
+## when pattern matching options are not enabled.
+# RUN: yaml2obj --docnum=2 %s -o %t9.o
+# RUN: llvm-objcopy %t9.o --set-symbol-visibility='f*o'=hidden
+# RUN: llvm-readelf -s %t9.o | FileCheck %s --check-prefix=SPECIAL
+
+# SPECIAL-DAG: HIDDEN      1 f*o
+## Unaffected symbol:
+# SPECIAL-DAG: DEFAULT     1 foo
+
+# RUN: yaml2obj --docnum=3 %s -o %t10.o
+
+## Check that the visibility of undefined symbols can be changed as well.
+# RUN: llvm-objcopy %t10.o --set-symbol-visibility=foo=hidden
+# RUN: llvm-readelf -s %t10.o | FileCheck %s --check-prefix=UNDEF
+# UNDEF: HIDDEN    UND foo
+
+## Check that passing an invalid visibility type generates an error message.
+# RUN: echo 'foo' > %t.symbols
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols=invalid-type 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=TYPE
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo=invalid-type 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=TYPE
+# TYPE: error: 'invalid-type' is not a valid symbol visibility
+
+## Check that omitting the '=' character generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols,hidden 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbols-visibility
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility=foo default 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=FORMAT -DOPTION=--set-symbol-visibility
+# FORMAT: error: bad format for [[OPTION]]
+
+## Check that using an invalid symbol pattern generates an error.
+# RUN: echo '*.' > %t.symbols.regex
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=%t.symbols.regex=hidden --regex 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=SYMBOL
+# RUN: not llvm-objcopy %t10.o --set-symbol-visibility='*.'=default --regex 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=SYMBOL
+# SYMBOL: error: cannot compile regular expression '*.': repetition-operator operand invalid
+
+## Check passing an invalid filename generates an error.
+# RUN: not llvm-objcopy %t10.o --set-symbols-visibility=no_file=hidden 2>&1 | \
+# RUN:   FileCheck %s --check-prefix=NO_FILE -DMSG=%errc_ENOENT
+# NO_FILE: error: 'no_file': [[MSG]]
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+Symbols:
+  - Name:    default_local
+    Section: .text
+    Binding:  STB_LOCAL
+  - Name:    protected_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_PROTECTED ]
+  - Name:    internal_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_local
+    Section: .text
+    Binding:  STB_LOCAL
+    Other:    [ STV_HIDDEN ]
+  - Name:    default_weak
+    Section: .text
+    Binding:  STB_WEAK
+  - Name:    internal_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_HIDDEN ]
+  - Name:    protected_weak
+    Section: .text
+    Binding:  STB_WEAK
+    Other:    [ STV_PROTECTED ]
+  - Name:    default_global
+    Section: .text
+    Binding:  STB_GLOBAL
+  - Name:    internal_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_INTERNAL ]
+  - Name:    hidden_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_HIDDEN ]
+  - Name:    protected_global
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_PROTECTED ]
+  - Name:    ignored_name
+    Section: .text
+    Binding:  STB_GLOBAL
+    Other:    [ STV_INTERNAL ]
+...
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name:  .text
+    Type:  SHT_PROGBITS
+Symbols:
+  - Name:    f*o
+    Section: .text
+    Binding:  STB_LOCAL
+  - Name:    foo
+    Section: .text
+    Binding:  STB_LOCAL
+...
+
+---
+!ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Symbols:
+  - Name:     foo
+    Binding:  STB_LOCAL
+...
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index ec9dc0a2a814d4c..6318578b110048c 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -254,6 +254,21 @@ parseSetSectionFlagValue(StringRef FlagValue) {
   return SFU;
 }
 
+static Expected<uint8_t> parseVisibilityType(StringRef VisType) {
+  const uint8_t Invalid = 0xff;
+  uint8_t type = StringSwitch<uint8_t>(VisType)
+                     .Case("default", ELF::STV_DEFAULT)
+                     .Case("hidden", ELF::STV_HIDDEN)
+                     .Case("internal", ELF::STV_INTERNAL)
+                     .Case("protected", ELF::STV_PROTECTED)
+                     .Default(Invalid);
+  if (type == Invalid)
+    return createStringError(errc::invalid_argument,
+                             "'%s' is not a valid symbol visibility",
+                             VisType.str().c_str());
+  return type;
+}
+
 namespace {
 struct TargetInfo {
   FileFormat Format;
@@ -969,6 +984,33 @@ objcopy::parseObjcopyOptions(ArrayRef RawArgsArr,
 
     Config.SymbolsToAdd.push_back(*SymInfo);
   }
+  for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbol_visibility)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      return createStringError(errc::invalid_argument,
+                               "bad format for --set-symbol-visibility");
+    auto [Sym, Visibility] = StringRef(Arg->getValue()).split('=');
+    Expected<uint8_t> Type = parseVisibilityType(Visibility);
+    if (!Type)
+      return Type.takeError();
+    ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+    if (Error E = ELFConfig.SymbolsToSetVisibility.back().first.addMatcher(
+            NameOrPattern::create(Sym, SymbolMatchStyle, ErrorCallback)))
+      return std::move(E);
+  }
+  for (auto *Arg : InputArgs.filtered(OBJCOPY_set_symbols_visibility)) {
+    if (!StringRef(Arg->getValue()).contains('='))
+      return createStringError(errc::invalid_argument,
+                               "bad format for --set-symbols-visibility");
+    auto [File, Visibility] = StringRef(Arg->getValue()).split('=');
+    Expected<uint8_t> Type = parseVisibilityType(Visibility);
+    if (!Type)
+      return Type.takeError();
+    ELFConfig.SymbolsToSetVisibility.emplace_back(NameMatcher(), *Type);
+    if (Error E =
+            addSymbolsFromFile(ELFConfig.SymbolsToSetVisibility.back().first,
+                               DC.Alloc, File, SymbolMatchStyle, ErrorCallback))
+      return std::move(E);
+  }
 
   ELFConfig.AllowBrokenLinks = InputArgs.hasArg(OBJCOPY_allow_broken_links);
 
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index 86774c889ab8600..3c0e5cd475a36b7 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -88,6 +88,17 @@ defm set_section_type
          "Set the type of section 
to the integer ">, MetaVarName<"section=type">; +defm set_symbol_visibility + : Eq<"set-symbol-visibility", + "Change the visibility of a symbol to the specified value">, + MetaVarName<"symbol=visibility">; +defm set_symbols_visibility + : Eq<"set-symbols-visibility", + "Read a list of symbols from and change their " + "visibility to the specified value. Visibility values: default, " + "internal, hidden, protected">, + MetaVarName<"filename=visibility">; + def S : Flag<["-"], "S">, Alias, HelpText<"Alias for --strip-all">; From 27b297bf21b6637047c1ac403f983351b9a3fc64 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 28 Feb 2024 20:58:20 +0800 Subject: [PATCH 063/114] [clang] Fix -Wunused-variable in CGCall.cpp (NFC) llvm-project/clang/lib/CodeGen/CGCall.cpp:3226:24: error: unused variable 'StructSize' [-Werror,-Wunused-variable] llvm::TypeSize StructSize = CGM.getDataLayout().getTypeAllocSize(STy); ^ llvm-project/clang/lib/CodeGen/CGCall.cpp:3227:24: error: unused variable 'PtrElementSize' [-Werror,-Wunused-variable] llvm::TypeSize PtrElementSize = ^ llvm-project/clang/lib/CodeGen/CGCall.cpp:5313:24: error: unused variable 'SrcTypeSize' [-Werror,-Wunused-variable] llvm::TypeSize SrcTypeSize = ^ llvm-project/clang/lib/CodeGen/CGCall.cpp:5315:24: error: unused variable 'DstTypeSize' [-Werror,-Wunused-variable] llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy); ^ 4 errors generated. --- clang/lib/CodeGen/CGCall.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 0d86fcf544d0fdd..13f68237b464d60 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3223,8 +3223,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, dyn_cast(ArgI.getCoerceToType()); if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy && STy->getNumElements() > 1) { - llvm::TypeSize StructSize = CGM.getDataLayout().getTypeAllocSize(STy); - llvm::TypeSize PtrElementSize = + [[maybe_unused]] llvm::TypeSize StructSize = + CGM.getDataLayout().getTypeAllocSize(STy); + [[maybe_unused]] llvm::TypeSize PtrElementSize = CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty)); if (STy->containsHomogeneousScalableVectorTypes()) { assert(StructSize == PtrElementSize && @@ -5310,9 +5311,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, dyn_cast(ArgInfo.getCoerceToType()); if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) { llvm::Type *SrcTy = ConvertTypeForMem(I->Ty); - llvm::TypeSize SrcTypeSize = + [[maybe_unused]] llvm::TypeSize SrcTypeSize = CGM.getDataLayout().getTypeAllocSize(SrcTy); - llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy); + [[maybe_unused]] llvm::TypeSize DstTypeSize = + CGM.getDataLayout().getTypeAllocSize(STy); if (STy->containsHomogeneousScalableVectorTypes()) { assert(SrcTypeSize == DstTypeSize && "Only allow non-fractional movement of structure with " From 1f74f5f48bc9e1091602163ac925c807d70706e1 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 28 Feb 2024 14:05:06 +0100 Subject: [PATCH 064/114] [flang] Fix flang build after #83132 (#83253) This fix is a temporary workaround. `LowerHLFIRIntrinsics.cpp` should be using the greedy pattern rewriter or a manual IR traversal. All patterns in this file are rewrite patterns. 
---
 clang/lib/CodeGen/CGCall.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 0d86fcf544d0fdd..13f68237b464d60 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -3223,8 +3223,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
         dyn_cast<llvm::StructType>(ArgI.getCoerceToType());
     if (ArgI.isDirect() && !ArgI.getCanBeFlattened() && STy &&
         STy->getNumElements() > 1) {
-      llvm::TypeSize StructSize = CGM.getDataLayout().getTypeAllocSize(STy);
-      llvm::TypeSize PtrElementSize =
+      [[maybe_unused]] llvm::TypeSize StructSize =
+          CGM.getDataLayout().getTypeAllocSize(STy);
+      [[maybe_unused]] llvm::TypeSize PtrElementSize =
           CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(Ty));
       if (STy->containsHomogeneousScalableVectorTypes()) {
         assert(StructSize == PtrElementSize &&
@@ -5310,9 +5311,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
         dyn_cast<llvm::StructType>(ArgInfo.getCoerceToType());
     if (STy && ArgInfo.isDirect() && !ArgInfo.getCanBeFlattened()) {
       llvm::Type *SrcTy = ConvertTypeForMem(I->Ty);
-      llvm::TypeSize SrcTypeSize =
+      [[maybe_unused]] llvm::TypeSize SrcTypeSize =
           CGM.getDataLayout().getTypeAllocSize(SrcTy);
-      llvm::TypeSize DstTypeSize = CGM.getDataLayout().getTypeAllocSize(STy);
+      [[maybe_unused]] llvm::TypeSize DstTypeSize =
+          CGM.getDataLayout().getTypeAllocSize(STy);
       if (STy->containsHomogeneousScalableVectorTypes()) {
         assert(SrcTypeSize == DstTypeSize &&
                "Only allow non-fractional movement of structure with "

From 1f74f5f48bc9e1091602163ac925c807d70706e1 Mon Sep 17 00:00:00 2001
From: Matthias Springer 
Date: Wed, 28 Feb 2024 14:05:06 +0100
Subject: [PATCH 064/114] [flang] Fix flang build after #83132 (#83253)

This fix is a temporary workaround. `LowerHLFIRIntrinsics.cpp` should be
using the greedy pattern rewriter or a manual IR traversal. All patterns
in this file are rewrite patterns.

The test failure was caused by `replaceAllUsesWith`, which is not
supported by the dialect conversion; additional asserts were added
recently to prevent incorrect API usage. These trigger now.
Alternatively, turning the patterns into conversion patterns and
specifying a type converter may work.

Failing test case:
`Fortran/gfortran/regression/gfortran-regression-compile-regression__inline_matmul_14_f90.test`
---
 .../Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
index 314e4264c17e833..377cc44392028f1 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp
@@ -176,7 +176,14 @@ class HlfirIntrinsicConversion : public mlir::OpRewritePattern<OP> {
         rewriter.eraseOp(use);
       }
     }
-    rewriter.replaceAllUsesWith(op->getResults(), {base});
+    // TODO: This entire pass should be a greedy pattern rewrite or a manual
+    // IR traversal. A dialect conversion cannot be used here because
+    // `replaceAllUsesWith` is not supported. Similarly, `replaceOp` is not
+    // suitable because "op->getResult(0)" and "base" can have different types.
+    // In such a case, the dialect conversion will attempt to convert the type,
+    // but no type converter is specified in this pass. Also note that all
+    // patterns in this pass are actually rewrite patterns.
+    op->getResult(0).replaceAllUsesWith(base);
     rewriter.replaceOp(op, base);
   }
 };

From 570bc5d291f92e19f6264262b02ddff1a2f2e09b Mon Sep 17 00:00:00 2001
From: Balazs Benics 
Date: Wed, 28 Feb 2024 14:09:39 +0100
Subject: [PATCH 065/114] Revert "[clang][analyzer] StreamChecker: Model getc,
 vfscanf, putc, vfprintf (#82476)"

This reverts commit ffe7049b543adb9739261d28a60d4a47a00aa2e0.

This commit breaks on e.g. arm:

Example:
https://lab.llvm.org/buildbot/#/builders/245/builds/21177/steps/5/logs/FAIL__Clang__stream_c

```
********************
TEST 'Clang :: Analysis/stream.c' FAILED
********************

Exit Code: 1

Command Output (stderr):
--
RUN: at line 1: /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/clang -cc1 -internal-isystem /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/lib/clang/19/include -nostdsysteminc -analyze -analyzer-constraints=range -setup-static-analyzer -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c
+ /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/bin/clang -cc1 -internal-isystem /home/tcwg-buildbot/worker/clang-armv8-quick/stage1/lib/clang/19/include -nostdsysteminc -analyze -analyzer-constraints=range -setup-static-analyzer -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c
error: 'expected-warning' diagnostics expected but not seen:
  File /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c Line 147: Stream pointer might be NULL
  File /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c Line 153: Stream pointer might be NULL
error: 'expected-warning' diagnostics seen but not expected:
  File /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c Line 148: Stream pointer might be NULL [alpha.unix.Stream]
  File /home/tcwg-buildbot/worker/clang-armv8-quick/llvm/clang/test/Analysis/stream.c Line 154: Stream pointer might be NULL [alpha.unix.Stream]
4 errors generated.

--

********************
```
---
 .../StaticAnalyzer/Checkers/StreamChecker.cpp | 23 ++--------
 ...ystem-header-simulator-for-simple-stream.h |  2 +-
 .../system-header-simulator-for-valist.h      |  6 ---
 .../Analysis/Inputs/system-header-simulator.h |  3 --
 clang/test/Analysis/stream-invalidate.c       | 42 -------------------
 clang/test/Analysis/stream.c                  | 34 ---------------
 6 files changed, 5 insertions(+), 105 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
index 29956fed2b3c24b..65bdc4cac309409 100644
--- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp
@@ -348,30 +348,18 @@ class StreamChecker : public Checker<check::PreCall, eval::Call,
-    if (Call.getCalleeIdentifier()->getName().equals("vfscanf")) {
-      SmallVector<unsigned int> EscArgs;
-      for (auto EscArg : llvm::seq(2u, Call.getNumArgs()))
-        EscArgs.push_back(EscArg);
-      StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs);
-    }
+    SmallVector<unsigned int> EscArgs;
+    for (auto EscArg : llvm::seq(2u, Call.getNumArgs()))
+      EscArgs.push_back(EscArg);
+    StateNotFailed = escapeArgs(StateNotFailed, C, Call, EscArgs);
     if (StateNotFailed)
       C.addTransition(StateNotFailed);
diff --git a/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h b/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h
index c26d3582149120e..098a2208fecbe91 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator-for-simple-stream.h
@@ -5,7 +5,7 @@
 // suppressed.
 #pragma clang system_header
 
-typedef struct _FILE {
+typedef struct __sFILE {
   unsigned char *_p;
 } FILE;
 FILE *fopen(const char *restrict, const char *restrict) __asm("_" "fopen" );
diff --git a/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h b/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h
index 720944abb8ad47c..7299b61353d460d 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator-for-valist.h
@@ -10,8 +10,6 @@
 #define restrict /*restrict*/
 #endif
 
-typedef struct _FILE FILE;
-
 typedef __builtin_va_list va_list;
 
 #define va_start(ap, param) __builtin_va_start(ap, param)
@@ -23,10 +21,6 @@ int vprintf (const char *restrict format, va_list arg);
 
 int vsprintf (char *restrict s, const char *restrict format, va_list arg);
 
-int vfprintf(FILE *stream, const char *format, va_list ap);
-
-int vfscanf(FILE *stream, const char *format, va_list ap);
-
 int some_library_function(int n, va_list arg);
 
 // No warning from system header.
diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h
index 8fd51449ecc0a42..15986984802c0e6 100644
--- a/clang/test/Analysis/Inputs/system-header-simulator.h
+++ b/clang/test/Analysis/Inputs/system-header-simulator.h
@@ -73,9 +73,6 @@ int ferror(FILE *stream);
 int fileno(FILE *stream);
 
 int fflush(FILE *stream);
-
-int getc(FILE *stream);
-
 size_t strlen(const char *);
 
 char *strcpy(char *restrict, const char *restrict);
diff --git a/clang/test/Analysis/stream-invalidate.c b/clang/test/Analysis/stream-invalidate.c
index 5046a356d0583d5..6745d11a2fe7017 100644
--- a/clang/test/Analysis/stream-invalidate.c
+++ b/clang/test/Analysis/stream-invalidate.c
@@ -4,7 +4,6 @@
 // RUN:   -analyzer-checker=debug.ExprInspection
 
 #include "Inputs/system-header-simulator.h"
-#include "Inputs/system-header-simulator-for-valist.h"
 
 void clang_analyzer_eval(int);
 void clang_analyzer_dump(int);
@@ -146,44 +145,3 @@ void test_fgetpos() {
 
   fclose(F);
 }
-
-void test_fprintf() {
-  FILE *F1 = tmpfile();
-  if (!F1)
-    return;
-
-  unsigned a = 42;
-  char *output = "HELLO";
-  int r = fprintf(F1, "%s\t%u\n", output, a);
-  // fprintf does not invalidate any of its input
-  // 69 is ascii for 'E'
-  clang_analyzer_dump(a); // expected-warning {{42 S32b}}
-  clang_analyzer_dump(output[1]); // expected-warning {{69 S32b}}
-  fclose(F1);
-}
-
-int test_vfscanf_inner(const char *fmt, ...) {
-  FILE *F1 = tmpfile();
-  if (!F1)
-    return EOF;
-
-  va_list ap;
-  va_start(ap, fmt);
-
-  int r = vfscanf(F1, fmt, ap);
-
-  fclose(F1);
-  va_end(ap);
-  return r;
-}
-
-void test_vfscanf() {
-  int i = 42;
-  int j = 43;
-  int r = test_vfscanf_inner("%d", &i);
-  if (r != EOF) {
-    // i gets invalidated by the call to test_vfscanf_inner, not by vfscanf.
-    clang_analyzer_dump(i); // expected-warning {{conj_$}}
-    clang_analyzer_dump(j); // expected-warning {{43 S32b}}
-  }
-}
diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c
index 7c7f68abeecac79..378c9154f8f6a87 100644
--- a/clang/test/Analysis/stream.c
+++ b/clang/test/Analysis/stream.c
@@ -1,7 +1,6 @@
 // RUN: %clang_analyze_cc1 -analyzer-checker=core,alpha.unix.Stream,debug.ExprInspection -verify %s
 
 #include "Inputs/system-header-simulator.h"
-#include "Inputs/system-header-simulator-for-valist.h"
 
 void clang_analyzer_eval(int);
 
@@ -66,24 +65,12 @@ void check_fseek(void) {
   fclose(fp);
 }
 
-void check_fseeko(void) {
-  FILE *fp = tmpfile();
-  fseeko(fp, 0, 0); // expected-warning {{Stream pointer might be NULL}}
-  fclose(fp);
-}
-
 void check_ftell(void) {
   FILE *fp = tmpfile();
   ftell(fp); // expected-warning {{Stream pointer might be NULL}}
   fclose(fp);
 }
 
-void check_ftello(void) {
-  FILE *fp = tmpfile();
-  ftello(fp); // expected-warning {{Stream pointer might be NULL}}
-  fclose(fp);
-}
-
 void check_rewind(void) {
   FILE *fp = tmpfile();
   rewind(fp); // expected-warning {{Stream pointer might be NULL}}
@@ -142,18 +129,6 @@ void f_dopen(int fd) {
   fclose(F);
 }
 
-void f_vfprintf(int fd, va_list args) {
-  FILE *F = fdopen(fd, "r");
-  vfprintf(F, "%d", args); // expected-warning {{Stream pointer might be NULL}}
-  fclose(F);
-}
-
-void f_vfscanf(int fd, va_list args) {
-  FILE *F = fdopen(fd, "r");
-  vfscanf(F, "%u", args); // expected-warning {{Stream pointer might be NULL}}
-  fclose(F);
-}
-
 void f_seek(void) {
   FILE *p = fopen("foo", "r");
   if (!p)
@@ -163,15 +138,6 @@ void f_seek(void) {
   fclose(p);
 }
 
-void f_seeko(void) {
-  FILE *p = fopen("foo", "r");
-  if (!p)
-    return;
-  fseeko(p, 1, SEEK_SET); // no-warning
-  fseeko(p, 1, 3); // expected-warning {{The whence argument to fseek() should be SEEK_SET, SEEK_END, or SEEK_CUR}}
-  fclose(p);
-}
-
 void f_double_close(void) {
   FILE *p = fopen("foo", "r");
   if (!p)

From 8a5d51b039c52c3e429390966670b0ab21cf257c Mon Sep 17 00:00:00 2001
From: Florian Hahn 
Date: Wed, 28 Feb 2024 13:14:39 +0000
Subject: [PATCH 066/114] [ConstraintElim] Use default depth for most calls of
 isNonNegative.

Helps to improve results in some cases, while being overall neutral with
respect to compile-time:
https://llvm-compile-time-tracker.com/compare.php?from=2c9b6c1b36b8185299de083c3058e0c1e7760442&to=5984b1649dc12741308089de235647cf036df95f&stat=instructions:u
---
 .../Scalar/ConstraintElimination.cpp          |  6 ++---
 .../loops-bottom-tested-pointer-cmps.ll       |  6 ++---
 .../loops-header-tested-pointer-cmps.ll       | 24 +++++++------------
 .../zext-for-per-formula-reasoning.ll         |  4 +---
 4 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 9b6a39e98f5ce8e..7e48c28176bd1cf 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -461,7 +461,7 @@ static Decomposition decomposeGEP(GEPOperator &GEP,
 
     // If Op0 is signed non-negative, the GEP is increasing monotonically and
    // can be de-composed.
- if (!isKnownNonNegative(Index, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Index, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Index, ConstantInt::get(Index->getType(), 0)); } @@ -560,10 +560,10 @@ static Decomposition decompose(Value *V, return MergeResults(Op0, Op1, IsSigned); } if (match(V, m_NSWAdd(m_Value(Op0), m_Value(Op1)))) { - if (!isKnownNonNegative(Op0, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Op0, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op0, ConstantInt::get(Op0->getType(), 0)); - if (!isKnownNonNegative(Op1, DL, /*Depth=*/MaxAnalysisRecursionDepth - 1)) + if (!isKnownNonNegative(Op1, DL)) Preconditions.emplace_back(CmpInst::ICMP_SGE, Op1, ConstantInt::get(Op1->getType(), 0)); diff --git a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll index e3f2a54f321edaa..279238bea1842e6 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-bottom-tested-pointer-cmps.ll @@ -93,9 +93,8 @@ define void @some_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 ; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]] -; CHECK-NEXT: [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]] ; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]] ; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 @@ -171,9 +170,8 @@ define void @no_checks_in_loops_removable(ptr %ptr, ptr %lower, ptr %upper, i8 % ; CHECK: loop.body: ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i16 [[IV]], 1 ; CHECK-NEXT: [[PTR_IV_1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i16 [[IV_1]] -; CHECK-NEXT: [[CMP_PTR_IV_1_LOWER:%.*]] = icmp ugt ptr [[LOWER]], [[PTR_IV_1]] ; CHECK-NEXT: [[CMP_PTR_IV_1_UPPER:%.*]] = icmp ule ptr [[UPPER]], [[PTR_IV_1]] -; CHECK-NEXT: [[OR_1:%.*]] = or i1 [[CMP_PTR_IV_1_LOWER]], [[CMP_PTR_IV_1_UPPER]] +; CHECK-NEXT: [[OR_1:%.*]] = or i1 false, [[CMP_PTR_IV_1_UPPER]] ; CHECK-NEXT: br i1 [[OR]], label [[TRAP]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: store i8 0, ptr [[PTR_IV]], align 4 diff --git a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll index 66ce1ffc6ebc93f..1842ca2d82545c1 100644 --- a/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll +++ b/llvm/test/Transforms/ConstraintElimination/loops-header-tested-pointer-cmps.ll @@ -27,18 +27,16 @@ define void @test1(ptr %src, ptr noundef %lower, ptr noundef %upper, i8 %N) { ; CHECK-NEXT: store i32 0, ptr [[PTR_SRC_IV]], align 4 ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]] -; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]] -; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]] +; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]] ; 
CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]] ; CHECK: loop.body.2: ; CHECK-NEXT: [[PTR_SRC_IV_1:%.*]] = bitcast ptr [[SRC_IV_1]] to ptr ; CHECK-NEXT: store i32 0, ptr [[PTR_SRC_IV_1]], align 4 ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2 ; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]] -; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]] -; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]] +; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]] ; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PTR_SRC_IV_2:%.*]] = bitcast ptr [[SRC_IV_2]] to ptr @@ -125,16 +123,14 @@ define void @test2(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK: loop.body.1: ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]] -; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]] -; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]] +; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]] ; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]] ; CHECK: loop.body.2: ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2 ; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]] -; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]] -; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]] +; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]] ; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr @@ -221,16 +217,14 @@ define void @test2_with_ne(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK: loop.body.1: ; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i8 [[IV]], 1 ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_1]] -; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]] -; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]] +; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]] ; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]] ; CHECK: loop.body.2: ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2 ; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]] -; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]] ; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]] -; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]] +; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]] ; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr @@ -316,16 +310,14 @@ define void @test3(ptr %src, ptr %lower, ptr %upper, i8 %N) { ; CHECK-NEXT: br i1 [[OR_1]], label [[TRAP_BB]], label [[LOOP_BODY_1:%.*]] ; CHECK: loop.body.1: ; CHECK-NEXT: [[SRC_IV_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[NEXT]] -; CHECK-NEXT: [[CMP_IV_1_START:%.*]] = icmp ult ptr [[SRC_IV_1]], 
[[LOWER]]
 ; CHECK-NEXT: [[CMP_IV_1_END:%.*]] = icmp uge ptr [[SRC_IV_1]], [[UPPER]]
-; CHECK-NEXT: [[OR_2:%.*]] = or i1 [[CMP_IV_1_START]], [[CMP_IV_1_END]]
+; CHECK-NEXT: [[OR_2:%.*]] = or i1 false, [[CMP_IV_1_END]]
 ; CHECK-NEXT: br i1 [[OR_2]], label [[TRAP_BB]], label [[LOOP_BODY_2:%.*]]
 ; CHECK: loop.body.2:
 ; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i8 [[IV]], 2
 ; CHECK-NEXT: [[SRC_IV_2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i8 [[ADD_2]]
-; CHECK-NEXT: [[CMP_IV_2_START:%.*]] = icmp ult ptr [[SRC_IV_2]], [[LOWER]]
 ; CHECK-NEXT: [[CMP_IV_2_END:%.*]] = icmp uge ptr [[SRC_IV_2]], [[UPPER]]
-; CHECK-NEXT: [[OR_3:%.*]] = or i1 [[CMP_IV_2_START]], [[CMP_IV_2_END]]
+; CHECK-NEXT: [[OR_3:%.*]] = or i1 false, [[CMP_IV_2_END]]
 ; CHECK-NEXT: br i1 [[OR_3]], label [[TRAP_BB]], label [[LOOP_LATCH]]
 ; CHECK: loop.latch:
 ; CHECK-NEXT: [[PTR:%.*]] = bitcast ptr [[SRC_IV]] to ptr
diff --git a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
index 63f5d4d4ba347b3..7844651a01f9a3b 100644
--- a/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
+++ b/llvm/test/Transforms/ConstraintElimination/zext-for-per-formula-reasoning.ll
@@ -90,11 +90,9 @@ define i1 @gep_zext_idx_adds(ptr %p, i8 %cnt, i8 %off) {
 ; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[CNT]] to i16
 ; CHECK-NEXT: [[EXT_1:%.*]] = add nuw nsw i16 [[EXT]], 1
 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i16 [[EXT_1]]
-; CHECK-NEXT: [[T_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[P]]
-; CHECK-NEXT: [[F_1:%.*]] = icmp ult ptr [[ADD_PTR]], [[P]]
 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, ptr [[P]], i16 11
 ; CHECK-NEXT: [[C_1:%.*]] = icmp uge ptr [[ADD_PTR]], [[GEP_11]]
-; CHECK-NEXT: [[RES_1:%.*]] = xor i1 [[T_1]], [[F_1]]
+; CHECK-NEXT: [[RES_1:%.*]] = xor i1 true, false
 ; CHECK-NEXT: [[RES_2:%.*]] = xor i1 [[RES_1]], [[C_1]]
 ; CHECK-NEXT: ret i1 [[RES_2]]
 ;

From 15d9d0fa8f55936625882a28759f0ec0033cb6de Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 28 Feb 2024 13:19:43 +0000
Subject: [PATCH 067/114] [VPlan] Also print final VPlan directly before
 codegen/execute. (#82269)

Some optimizations are applied after UF and VF have been chosen. This
patch adds an extra print of the final VPlan just before
codegen/execution. In the future, there will be additional transforms
that are applied later (interleaving for example).
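As a usage sketch (mirroring the RUN line of the test added below; the
input file name is illustrative), the extra dump appears in the debug
output of an asserts build:

  opt -passes=loop-vectorize -force-vector-width=8 \
      -force-vector-interleave=2 -disable-output -debug input.ll

The output then contains the plan twice: once as "Initial VPlan for
VF={8},UF>=1" and, after optimizeForVFAndUF, as "Final VPlan for
VF={8},UF={2}".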
PR: https://github.com/llvm/llvm-project/pull/82269 --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- .../RISCV/riscv-vector-reverse.ll | 4 +- .../vplan-printing-before-execute.ll | 90 +++++++++++++++++++ 3 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e5deac7975728fb..2e76adb83bd5039 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7450,12 +7450,14 @@ LoopVectorizationPlanner::executePlan( (IsEpilogueVectorization || !ExpandedSCEVs) && "expanded SCEVs to reuse can only be used during epilogue vectorization"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); - if (!IsEpilogueVectorization) VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE); + LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF + << ", UF=" << BestUF << '\n'); + BestVPlan.setName("Final VPlan"); + LLVM_DEBUG(BestVPlan.dump()); + // Perform the actual loop transformation. VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan, OrigLoop->getHeader()->getContext()); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 1bcd7a2e009e0b0..da6dc34e4096849 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -120,7 +120,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager +; CHECK: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; entry: @@ -260,7 +260,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop ; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 -; CHECK-NEXT: LV: Interleaving disabled by the pass manager +; CHECK: LV: Interleaving disabled by the pass manager ; CHECK-NEXT: LV: Vectorizing: innermost loop. ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll new file mode 100644 index 000000000000000..1dddbfe20a2ed7e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-before-execute.ll @@ -0,0 +1,90 @@ +; RUN: opt -passes=loop-vectorize -force-vector-width=8 -force-vector-interleave=2 -disable-output -debug -S %s 2>&1 | FileCheck --check-prefixes=CHECK %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + +; REQUIRES: asserts + +; Check if the vector loop condition can be simplified to true for a given +; VF/IC combination. 
+define void @test_tc_less_than_16(ptr %A, i64 %N) { +; CHECK: LV: Scalarizing: %cmp = +; CHECK-NEXT: VPlan 'Initial VPlan for VF={8},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ph: +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1 +; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer ir<%p.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]> +; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> +; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src> +; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +; CHECK: Executing best plan with VF=8, UF=2 +; CHECK-NEXT: VPlan 'Final VPlan for VF={8},UF={2}' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ph: +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (zext i4 (trunc i64 %N to i4) to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: EMIT ir<%p.src> = WIDEN-POINTER-INDUCTION ir<%A>, 1 +; CHECK-NEXT: vp<[[VPTR:%.]]> = vector-pointer ir<%p.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VPTR]]> +; CHECK-NEXT: WIDEN ir<%add> = add nsw ir<%l>, ir<10> +; CHECK-NEXT: vp<[[VPTR2:%.+]]> = vector-pointer ir<%p.src> +; CHECK-NEXT: WIDEN store vp<[[VPTR2]]>, ir<%add> +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV:%.+]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-cond ir +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; +entry: + %and = and i64 %N, 15 + br label %loop + +loop: + %iv = phi i64 [ %and, %entry ], [ %iv.next, %loop ] + %p.src = phi ptr [ %A, %entry ], [ %p.src.next, %loop ] + %p.src.next = getelementptr inbounds i8, ptr %p.src, i64 1 + %l = load i8, ptr %p.src, align 1 + %add = add nsw i8 %l, 10 + store i8 %add, ptr %p.src + %iv.next = add nsw i64 %iv, -1 + %cmp = icmp eq i64 %iv.next, 0 + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} From 0e42289236d44408e50a710dace629ebad2812b6 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Wed, 28 Feb 2024 21:19:04 +0800 Subject: [PATCH 068/114] [flang] Fix build failure (NFC) llvm-project/flang/include/flang/Lower/SymbolMap.h:52:1: error: 'SymbolBox' defined as a struct here but previously declared as a class; this is valid, but may result in linker errors under the Microsoft C++ ABI [-Werror,-Wmismatched-tags] struct SymbolBox : public fir::details::matcher { ^ 
llvm-project/flang/include/flang/Lower/AbstractConverter.h:56:1: note: did you mean struct here?
class SymbolBox;
^~~~~
struct
---
 flang/include/flang/Lower/AbstractConverter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h
index 944430a15488391..32e7a5e2b040615 100644
--- a/flang/include/flang/Lower/AbstractConverter.h
+++ b/flang/include/flang/Lower/AbstractConverter.h
@@ -53,7 +53,7 @@ class DerivedTypeSpec;
 namespace lower {
 class SymMap;
-class SymbolBox;
+struct SymbolBox;
 namespace pft {
 struct Variable;
 }

From 926a19bf0b7ea0aa34f2685534b5f4a339f8b409 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Wed, 28 Feb 2024 14:26:02 +0100
Subject: [PATCH 069/114] [mlir][Transforms][NFC] Remove `SplitBlockRewrite`
 (#82777)

When splitting a block during a dialect conversion, a
`SplitBlockRewrite` object is stored in the dialect conversion state.
This commit removes `SplitBlockRewrite`. Instead, a combination of
`CreateBlockRewrite` and multiple `MoveOperationRewrite` is used. This
change simplifies the internal state of the dialect conversion and is
also needed to properly support listeners. `RewriterBase::splitBlock`
is no longer virtual. All necessary information for committing/rolling
back a split block rewrite can be deduced from
`Listener::notifyBlockInserted` and `Listener::notifyOperationInserted`
(which is also called when moving an operation).
---
 mlir/include/mlir/IR/PatternMatch.h | 2 +-
 .../mlir/Transforms/DialectConversion.h | 3 --
 .../Transforms/Utils/DialectConversion.cpp | 42 ------------------
 3 files changed, 1 insertion(+), 46 deletions(-)

diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index 2ce3bc3fc2e783a..f8d22cfb22afd05 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -579,7 +579,7 @@ class RewriterBase : public OpBuilder {
   /// Split the operations starting at "before" (inclusive) out of the given
   /// block into a new block, and return it.
-  virtual Block *splitBlock(Block *block, Block::iterator before);
+  Block *splitBlock(Block *block, Block::iterator before);

   /// Unlink this operation from its current block and insert it right before
   /// `existingOp` which may be in the same or another block in the same
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 7e8e67a9d178247..84396529eb7c2e0 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -741,9 +741,6 @@ class ConversionPatternRewriter final : public PatternRewriter {
   /// implemented for dialect conversion.
   void eraseBlock(Block *block) override;

-  /// PatternRewriter hook for splitting a block into two parts.
-  Block *splitBlock(Block *block, Block::iterator before) override;
-
   /// PatternRewriter hook for inlining the ops of a block into another block.
void inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues = std::nullopt) override; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 5d399ce1eb9cf02..26899301eb742e7 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -192,7 +192,6 @@ class IRRewrite { EraseBlock, InlineBlock, MoveBlock, - SplitBlock, BlockTypeConversion, ReplaceBlockArg, // Operation rewrites @@ -400,30 +399,6 @@ class MoveBlockRewrite : public BlockRewrite { Block *insertBeforeBlock; }; -/// Splitting of a block. This rewrite is immediately reflected in the IR. -class SplitBlockRewrite : public BlockRewrite { -public: - SplitBlockRewrite(ConversionPatternRewriterImpl &rewriterImpl, Block *block, - Block *originalBlock) - : BlockRewrite(Kind::SplitBlock, rewriterImpl, block), - originalBlock(originalBlock) {} - - static bool classof(const IRRewrite *rewrite) { - return rewrite->getKind() == Kind::SplitBlock; - } - - void rollback() override { - // Merge back the block that was split out. - originalBlock->getOperations().splice(originalBlock->end(), - block->getOperations()); - eraseBlock(block); - } - -private: - // The original block from which this block was split. - Block *originalBlock; -}; - /// This structure contains the information pertaining to an argument that has /// been converted. struct ConvertedArgInfo { @@ -883,9 +858,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { void notifyBlockInserted(Block *block, Region *previous, Region::iterator previousIt) override; - /// Notifies that a block was split. - void notifySplitBlock(Block *block, Block *continuation); - /// Notifies that a block is being inlined into another block. void notifyBlockBeingInlined(Block *block, Block *srcBlock, Block::iterator before); @@ -1522,11 +1494,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted( appendRewrite(block, previous, prevBlock); } -void ConversionPatternRewriterImpl::notifySplitBlock(Block *block, - Block *continuation) { - appendRewrite(continuation, block); -} - void ConversionPatternRewriterImpl::notifyBlockBeingInlined( Block *block, Block *srcBlock, Block::iterator before) { appendRewrite(block, srcBlock, before); @@ -1665,15 +1632,6 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, results); } -Block *ConversionPatternRewriter::splitBlock(Block *block, - Block::iterator before) { - assert(!impl->wasOpReplaced(block->getParentOp()) && - "attempting to split a block within a replaced/erased op"); - auto *continuation = block->splitBlock(before); - impl->notifySplitBlock(block, continuation); - return continuation; -} - void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { From a2efb68906ec2bf7b55b464060c3713e395e68e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 12:55:56 +0100 Subject: [PATCH 070/114] [clang][Interp] Remove now faulty assertion We can call getBase() for pointers where Base != Offset as well, for example when we've added a constant to the Offset. 
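As a minimal illustration (it mirrors the test added below), the case in
question arises in constant expressions such as:

  struct S {
    int m;
  };
  constexpr S s = { 5 };
  // &s.m + 1 adds a constant to the pointer's Offset, so getBase() is
  // reached with Base != Offset and the removed assertion would have
  // fired, even though the pointer is valid.
  constexpr const int *p = &s.m + 1;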
--- clang/lib/AST/Interp/Pointer.h | 1 -
 clang/test/AST/Interp/cxx11.cpp | 6 ++++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index fa2e03d71190f55..34ecdb967960d53 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -215,7 +215,6 @@ class Pointer {
       assert(Offset == PastEndMark && "cannot get base of a block");
       return Pointer(Pointee, Base, 0);
     }
-    assert(Offset == Base && "not an inner field");
     unsigned NewBase = Base - getInlineDesc()->Offset;
     return Pointer(Pointee, NewBase, NewBase);
   }
diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp
index 0a1e0f3fd28e9d0..29098ea694c6881 100644
--- a/clang/test/AST/Interp/cxx11.cpp
+++ b/clang/test/AST/Interp/cxx11.cpp
@@ -22,3 +22,9 @@ int array2[recurse2]; // both-warning {{variable length arrays in C++}} \
 // both-note {{initializer of 'recurse2' is not a constant expression}} \
 // expected-error {{variable length array declaration not allowed at file scope}} \
 // ref-warning {{variable length array folded to constant array as an extension}}
+
+struct S {
+  int m;
+};
+constexpr S s = { 5 };
+constexpr const int *p = &s.m + 1;

From 915fce040271c77df1ff9b2c8797c441cec0d18d Mon Sep 17 00:00:00 2001
From: Rishabh Bali
Date: Wed, 28 Feb 2024 18:58:53 +0530
Subject: [PATCH 071/114] [mlir][affine] Enable ConvertAffineToStandard pass
 to handle affine.delinearize_index Op. (#82189)

This PR aims to enable the `ConvertAffineToStandard` pass to handle the
`affine.delinearize_index` operation.

Fixes #78458
---
 .../AffineToStandard/AffineToStandard.cpp | 2 +
 .../AffineToStandard/lower-affine.mlir | 54 +++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
index e69f9c837ca1d61..10ccd5c97783bf8 100644
--- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
+++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Affine/Transforms/Transforms.h"
 #include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
@@ -558,6 +559,7 @@ class LowerAffinePass
     RewritePatternSet patterns(&getContext());
     populateAffineToStdConversionPatterns(patterns);
     populateAffineToVectorConversionPatterns(patterns);
+    populateAffineExpandIndexOpsPatterns(patterns);
     ConversionTarget target(getContext());
     target.addLegalDialect();
diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
index 00d7b6b8d65f672..23e0edd510cbb14 100644
--- a/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
+++ b/mlir/test/Conversion/AffineToStandard/lower-affine.mlir
@@ -927,3 +927,57 @@ func.func @affine_parallel_with_reductions_i64(%arg0: memref<3x3xi64>, %arg1: me
 // CHECK: scf.reduce.return %[[RES]] : i64
 // CHECK: }
 // CHECK: }
+
+///////////////////////////////////////////////////////////////////////
+
+func.func @test_dilinearize_index(%linear_index: index) -> (index, index, index) {
+  %b0 = arith.constant 16 : index
+  %b1 = arith.constant 224 : index
+  %b2 = arith.constant 224 : index
+  %1:3 = affine.delinearize_index %linear_index into (%b0, %b1, %b2) : index, index, index
+  return %1#0, %1#1, %1#2 : index, index, index
+}
+// CHECK-LABEL: func.func @test_dilinearize_index(
+// CHECK-SAME: %[[VAL_0:.*]]: index) -> (index, index, index) {
+// CHECK: %[[VAL_1:.*]] = arith.constant 16 : index
+// CHECK: %[[VAL_2:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_7:.*]] = arith.constant -1 : index
+// CHECK: %[[VAL_8:.*]] = arith.cmpi slt, %[[VAL_0]], %[[VAL_6]] : index
+// CHECK: %[[VAL_9:.*]] = arith.subi %[[VAL_7]], %[[VAL_0]] : index
+// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_0]] : index
+// CHECK: %[[VAL_11:.*]] = arith.divsi %[[VAL_10]], %[[VAL_5]] : index
+// CHECK: %[[VAL_12:.*]] = arith.subi %[[VAL_7]], %[[VAL_11]] : index
+// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_8]], %[[VAL_12]], %[[VAL_11]] : index
+// CHECK: %[[VAL_14:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_15:.*]] = arith.remsi %[[VAL_0]], %[[VAL_14]] : index
+// CHECK: %[[VAL_16:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_16]] : index
+// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_14]] : index
+// CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_15]] : index
+// CHECK: %[[VAL_20:.*]] = arith.constant 50176 : index
+// CHECK: %[[VAL_21:.*]] = arith.remsi %[[VAL_0]], %[[VAL_20]] : index
+// CHECK: %[[VAL_22:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_23:.*]] = arith.cmpi slt, %[[VAL_21]], %[[VAL_22]] : index
+// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_20]] : index
+// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_21]] : index
+// CHECK: %[[VAL_26:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_27:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_28:.*]] = arith.constant -1 : index
+// CHECK: %[[VAL_29:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_27]] : index
+// CHECK: %[[VAL_30:.*]] = arith.subi %[[VAL_28]], %[[VAL_25]] : index
+// CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_29]], %[[VAL_30]], %[[VAL_25]] : index
+// CHECK: %[[VAL_32:.*]] = arith.divsi %[[VAL_31]], %[[VAL_26]] : index
+// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_28]], %[[VAL_32]] : index
+// CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_29]], %[[VAL_33]], %[[VAL_32]] : index
+// CHECK: %[[VAL_35:.*]] = arith.constant 224 : index
+// CHECK: %[[VAL_36:.*]] = arith.remsi %[[VAL_0]], %[[VAL_35]] : index
+// CHECK: %[[VAL_37:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_38:.*]] = arith.cmpi slt, %[[VAL_36]], %[[VAL_37]] : index
+// CHECK: %[[VAL_39:.*]] = arith.addi %[[VAL_36]], %[[VAL_35]] : index
+// CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_38]], %[[VAL_39]], %[[VAL_36]] : index
+// CHECK: return %[[VAL_13]], %[[VAL_34]], %[[VAL_40]] : index, index, index
+// CHECK: }

From 06f775a82f6f562f8de75053f62c9c0dbeaa67d2 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 28 Feb 2024 14:30:29 +0100
Subject: [PATCH 072/114] [flang] Give internal linkage to internal procedures
 (#81929)

Internal procedures cannot be called directly from outside the host
procedure, so there is no point giving them external linkage. The only
reason flang did so is that external linkage is the default in MLIR.
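For illustration (a minimal sketch; the procedure names are arbitrary),
an internal procedure is only reachable through its host, and is lowered
to a mangled symbol such as _QFhostPinner:

  subroutine host()
    call inner()  ! only call sites inside host can reach inner
  contains
    subroutine inner()
    end subroutine
  end subroutine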
Giving external linkage to them:
- prevents LLVM from deleting them when they are not used or inlined
- causes bugs with shared libraries (at least on linux x86-64) because
the call to the internal function could lead to a dynamic loader call
that would overwrite the r10 register (the static chain pointer) due to
system calls and not restore it (it seems the loader does not expect
r10 to be used for PLT calls).

This patch gives internal linkage to internal procedures.

Note: the llvm.linkage attribute name cannot be obtained via
getLinkageAttrName since it is not the same name as the one used in the
LLVM dialect. It is just a placeholder defined in
mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp until the func dialect
gets a real linkage model. So simply avoid hard-coding it too many
times in lowering.
---
 .../flang/Optimizer/Builder/FIRBuilder.h | 3 +
 flang/lib/Lower/Bridge.cpp | 11 +++-
 flang/lib/Optimizer/Builder/FIRBuilder.cpp | 8 +++
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 5 +-
 .../allocatable-end-of-scope-dealloc.f90 | 2 +-
 flang/test/Lower/HLFIR/bindc_internal_proc.f90 | 4 +-
 .../Lower/HLFIR/internal-procedures-2.f90 | 2 +-
 flang/test/Lower/HLFIR/internal-procedures.f90 | 12 ++--
 flang/test/Lower/Intrinsics/random.f90 | 4 +-
 flang/test/Lower/Intrinsics/ubound01.f90 | 2 +-
 flang/test/Lower/OpenACC/acc-routine04.f90 | 2 +-
 .../FIR/threadprivate-use-association-2.f90 | 2 +-
 .../OpenMP/threadprivate-commonblock-use.f90 | 2 +-
 .../threadprivate-use-association-2-hlfir.f90 | 2 +-
 flang/test/Lower/PowerPC/ppc-vector-types.f90 | 10 +--
 flang/test/Lower/array-temp.f90 | 2 +-
 flang/test/Lower/dummy-arguments.f90 | 2 +-
 flang/test/Lower/dummy-procedure-character.f90 | 4 +-
 .../Lower/equivalence-with-host-assoc.f90 | 16 ++---
 .../Lower/explicit-interface-results-2.f90 | 12 ++--
 flang/test/Lower/forall/array-constructor.f90 | 8 +--
 flang/test/Lower/forall/character-1.f90 | 2 +-
 flang/test/Lower/global-initialization.f90 | 9 ++-
 flang/test/Lower/host-associated-functions.f90 | 16 ++---
 flang/test/Lower/host-associated-globals.f90 | 10 +--
 flang/test/Lower/host-associated.f90 | 64 +++++++++----------
 flang/test/Lower/module-and-internal-proc.f90 | 4 +-
 flang/test/Lower/parent-component.f90 | 18 +++---
 flang/test/Lower/polymorphic.f90 | 4 +-
 .../test/Lower/program-units-fir-mangling.f90 | 18 +++---
 30 files changed, 140 insertions(+), 120 deletions(-)

diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 39821f1036c63fc..bd9b67b14b966cd 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -688,6 +688,9 @@ fir::BoxValue createBoxValue(fir::FirOpBuilder &builder, mlir::Location loc,
 /// Generate Null BoxProc for procedure pointer null initialization.
 mlir::Value createNullBoxProc(fir::FirOpBuilder &builder, mlir::Location loc,
                               mlir::Type boxType);
+
+/// Set internal linkage attribute on a function.
+void setInternalLinkage(mlir::func::FuncOp); } // namespace fir::factory #endif // FORTRAN_OPTIMIZER_BUILDER_FIRBUILDER_H diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 0691753defceb30..153ce0623ab3aee 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4488,7 +4488,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { assert(builder && "FirOpBuilder did not instantiate"); builder->setFastMathFlags(bridge.getLoweringOptions().getMathOptions()); builder->setInsertionPointToStart(&func.front()); - func.setVisibility(mlir::SymbolTable::Visibility::Public); + if (funit.parent.isA()) { + // Give internal linkage to internal functions. There are no name clash + // risks, but giving global linkage to internal procedure will break the + // static link register in shared libraries because of the system calls. + // Also, it should be possible to eliminate the procedure code if all the + // uses have been inlined. + fir::factory::setInternalLinkage(func); + } else { + func.setVisibility(mlir::SymbolTable::Visibility::Public); + } assert(blockId == 0 && "invalid blockId"); assert(activeConstructStack.empty() && "invalid construct stack state"); diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 3cce39f5b8c782d..788c99e40105a26 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -18,6 +18,7 @@ #include "flang/Optimizer/Dialect/FIROpsSupport.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/InternalNames.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "llvm/ADT/ArrayRef.h" @@ -1533,3 +1534,10 @@ mlir::Value fir::factory::createNullBoxProc(fir::FirOpBuilder &builder, mlir::Value initVal{builder.create(loc, boxEleTy)}; return builder.create(loc, boxTy, initVal); } + +void fir::factory::setInternalLinkage(mlir::func::FuncOp func) { + auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal; + auto linkage = + mlir::LLVM::LinkageAttr::get(func->getContext(), internalLinkage); + func->setAttr("llvm.linkage", linkage); +} diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index c84fb27cb38da51..837ef0576ef6962 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1834,10 +1834,7 @@ mlir::func::FuncOp IntrinsicLibrary::getWrapper(GeneratorType generator, // First time this wrapper is needed, build it. function = builder.createFunction(loc, wrapperName, funcType); function->setAttr("fir.intrinsic", builder.getUnitAttr()); - auto internalLinkage = mlir::LLVM::linkage::Linkage::Internal; - auto linkage = - mlir::LLVM::LinkageAttr::get(builder.getContext(), internalLinkage); - function->setAttr("llvm.linkage", linkage); + fir::factory::setInternalLinkage(function); function.addEntryBlock(); // Create local context to emit code into the newly created function diff --git a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 index ad4b015ef9443fc..05cae6e5ba6ccfa 100644 --- a/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 +++ b/flang/test/Lower/HLFIR/allocatable-end-of-scope-dealloc.f90 @@ -224,7 +224,7 @@ subroutine internal() allocate(x) end subroutine end subroutine -! 
CHECK-LABEL: func.func @_QFno_dealloc_host_assocPinternal +! CHECK-LABEL: func.func private @_QFno_dealloc_host_assocPinternal ! CHECK-NOT: freemem ! CHECK-NOT: Deallocate ! CHECK: return diff --git a/flang/test/Lower/HLFIR/bindc_internal_proc.f90 b/flang/test/Lower/HLFIR/bindc_internal_proc.f90 index 027c94f95a326e5..00e24c7016f19f1 100644 --- a/flang/test/Lower/HLFIR/bindc_internal_proc.f90 +++ b/flang/test/Lower/HLFIR/bindc_internal_proc.f90 @@ -3,7 +3,7 @@ ! internal procedures. ! RUN: bbc -emit-hlfir %s -o - | FileCheck %s -!CHECK: func.func @_QFsub1Pfoo(%{{.*}}: i32 +!CHECK: func.func private @_QFsub1Pfoo(%{{.*}}: i32 subroutine sub1() call foo(42) contains @@ -13,7 +13,7 @@ subroutine foo(i) bind(c) end subroutine end subroutine -!CHECK: func.func @_QFsub2Pfoo(%{{.*}}: i64 +!CHECK: func.func private @_QFsub2Pfoo(%{{.*}}: i64 subroutine sub2() call foo(42_8) contains diff --git a/flang/test/Lower/HLFIR/internal-procedures-2.f90 b/flang/test/Lower/HLFIR/internal-procedures-2.f90 index bb05545bef1aa27..f1c4780954b24d9 100644 --- a/flang/test/Lower/HLFIR/internal-procedures-2.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures-2.f90 @@ -23,7 +23,7 @@ subroutine internal_procedure(i, mask) end forall end subroutine end subroutine -! CHECK-LABEL: func.func @_QFhost_procedurePinternal_procedure( +! CHECK-LABEL: func.func private @_QFhost_procedurePinternal_procedure( ! CHECK: fir.address_of(@_QMmodule_used_by_hostEindexed_by_var) : !fir.ref> ! CHECK: fir.address_of(@_QMmodule_used_by_hostEref_in_forall) : !fir.ref> ! CHECK: fir.address_of(@_QMmodule_used_by_hostEref_in_implied_do) : !fir.ref diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90 index d517cb4345afa93..c898903b6fbe112 100644 --- a/flang/test/Lower/HLFIR/internal-procedures.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures.f90 @@ -9,8 +9,8 @@ subroutine internal call takes_array(x) end subroutine end subroutine -! CHECK-LABEL: func.func @_QFtest_explicit_shape_arrayPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFtest_explicit_shape_arrayPinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -27,8 +27,8 @@ subroutine internal call takes_array(x) end subroutine end subroutine -! CHECK-LABEL: func.func @_QFtest_assumed_shapePinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFtest_assumed_shapePinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -44,8 +44,8 @@ subroutine internal() call bar(c) end subroutine end subroutine -! CHECK-LABEL: func.func @_QFtest_scalar_charPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFtest_scalar_charPinternal( +! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref> diff --git a/flang/test/Lower/Intrinsics/random.f90 b/flang/test/Lower/Intrinsics/random.f90 index ca194befd027412..4fb1a9a5da27a76 100644 --- a/flang/test/Lower/Intrinsics/random.f90 +++ b/flang/test/Lower/Intrinsics/random.f90 @@ -45,7 +45,7 @@ subroutine random_test_2 call foo(size) call bar(size, get) contains - ! CHECK-LABEL: func @_QFrandom_test_2Pfoo + ! CHECK-LABEL: func private @_QFrandom_test_2Pfoo subroutine foo(size, put, get) ! CHECK: [[s1:%[0-9]+]] = fir.is_present %arg0 ! CHECK: [[s2:%[0-9]+]] = fir.embox %arg0 @@ -70,7 +70,7 @@ subroutine foo(size, put, get) print*, size end subroutine - ! CHECK-LABEL: func @_QFrandom_test_2Pbar + ! CHECK-LABEL: func private @_QFrandom_test_2Pbar subroutine bar(size, get, put) integer, optional :: size ! CHECK: [[p1:%[0-9]+]] = fir.is_present %arg2 diff --git a/flang/test/Lower/Intrinsics/ubound01.f90 b/flang/test/Lower/Intrinsics/ubound01.f90 index 4ebe3870cf0b36f..df51d79eb6afe09 100644 --- a/flang/test/Lower/Intrinsics/ubound01.f90 +++ b/flang/test/Lower/Intrinsics/ubound01.f90 @@ -16,7 +16,7 @@ subroutine s2(a,n,n2) End Subroutine end -! CHECK-LABEL: func.func @_QFPs2 +! CHECK-LABEL: func.func private @_QFPs2 ! CHECK-SAME: %[[ARG0:.*]]: !fir.box> ! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]](%{{.*}}) : (!fir.box>, !fir.shift<2>) -> !fir.box> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[BOX]] : (!fir.box>) -> !fir.box diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index b5f5aa2ca4882e0..2339c23eaaf857c 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -31,4 +31,4 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} -! CHECK: func.func @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>} +! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 index 14c0dff8da4bb18..6db5735c21f1e17 100644 --- a/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 +++ b/flang/test/Lower/OpenMP/FIR/threadprivate-use-association-2.f90 @@ -17,7 +17,7 @@ module m ! CHECK: return ! CHECK: } ! -! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() { +! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref ! CHECK: %[[VAL_1:.*]] = omp.threadprivate %[[VAL_0]] : !fir.ref -> !fir.ref ! 
CHECK: fir.call @_QPbar(%[[VAL_1]]) {{.*}}: (!fir.ref) -> () diff --git a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 index 28616f7595a0d6f..71f1c7608a2c318 100644 --- a/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-commonblock-use.f90 @@ -15,7 +15,7 @@ module m1 subroutine ss1 use m0 contains -!CHECK-LABEL: func @_QMm1Fss1Pss2 +!CHECK-LABEL: func private @_QMm1Fss1Pss2 !CHECK: %[[CMN:.*]] = fir.address_of(@cmn_) : !fir.ref> !CHECK: omp.parallel !CHECK: %{{.*}} = omp.threadprivate %[[CMN]] : !fir.ref> -> !fir.ref> diff --git a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 index 722f023fbefc96d..79a1ce9897f2564 100644 --- a/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-use-association-2-hlfir.f90 @@ -19,7 +19,7 @@ module m ! CHECK: return ! CHECK: } -! CHECK-LABEL: func.func @_QMm2FtestPinternal_test() { +! CHECK-LABEL: func.func private @_QMm2FtestPinternal_test() {{.*}} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMmEx) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QMmEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]]#1 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/PowerPC/ppc-vector-types.f90 b/flang/test/Lower/PowerPC/ppc-vector-types.f90 index be293f873ecb429..4745e4567b2d121 100644 --- a/flang/test/Lower/PowerPC/ppc-vector-types.f90 +++ b/flang/test/Lower/PowerPC/ppc-vector-types.f90 @@ -44,7 +44,7 @@ program ppc_vec_unit ! CHECK-LLVM-NEXT: store <512 x i1> %[[RESQ]], ptr @_QFEvq2, align 64 contains - ! CHECK-LLVM-LABEL: define <4 x i32> @_QFPtest_vec_integer_assign + ! CHECK-LLVM-LABEL: define internal <4 x i32> @_QFPtest_vec_integer_assign function test_vec_integer_assign(arg1) ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <4 x i32>, i64 1, align 16 vector(integer(4)) :: arg1, test_vec_integer_assign @@ -58,7 +58,7 @@ function test_vec_integer_assign(arg1) ! CHECK-LLVM-NEXT: ret <4 x i32> %[[RET]] end function test_vec_integer_assign - ! CHECK-LLVM-LABEL: define <2 x double> @_QFPtest_vec_real_assign + ! CHECK-LLVM-LABEL: define internal <2 x double> @_QFPtest_vec_real_assign function test_vec_real_assign(arg1) ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <2 x double>, i64 1, align 16 vector(real(8)) :: arg1, test_vec_real_assign @@ -72,7 +72,7 @@ function test_vec_real_assign(arg1) ! CHECK-LLVM-NEXT: ret <2 x double> %[[RET]] end function test_vec_real_assign - ! CHECK-LLVM-LABEL: define <8 x i16> @_QFPtest_vec_unsigned_assign + ! CHECK-LLVM-LABEL: define internal <8 x i16> @_QFPtest_vec_unsigned_assign function test_vec_unsigned_assign(arg1) ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <8 x i16>, i64 1, align 16 vector(unsigned(2)) :: arg1, test_vec_unsigned_assign @@ -86,7 +86,7 @@ function test_vec_unsigned_assign(arg1) ! CHECK-LLVM-NEXT: ret <8 x i16> %[[RET]] end function test_vec_unsigned_assign - ! CHECK-LLVM-LABEL: define <256 x i1> @_QFPtest_vec_pair_assign + ! CHECK-LLVM-LABEL: define internal <256 x i1> @_QFPtest_vec_pair_assign function test_vec_pair_assign(arg1) ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <256 x i1>, i64 1, align 32 __vector_pair :: arg1, test_vec_pair_assign @@ -100,7 +100,7 @@ function test_vec_pair_assign(arg1) ! CHECK-LLVM-NEXT: ret <256 x i1> %[[RET]] end function test_vec_pair_assign - ! 
CHECK-LLVM-LABEL: define <512 x i1> @_QFPtest_vec_quad_assign + ! CHECK-LLVM-LABEL: define internal <512 x i1> @_QFPtest_vec_quad_assign function test_vec_quad_assign(arg1) ! CHECK-LLVM: %[[FUNC_RES:.*]] = alloca <512 x i1>, i64 1, align 64 __vector_quad :: arg1, test_vec_quad_assign diff --git a/flang/test/Lower/array-temp.f90 b/flang/test/Lower/array-temp.f90 index 347d4cef78bcd1c..10c5ee91d44bdae 100644 --- a/flang/test/Lower/array-temp.f90 +++ b/flang/test/Lower/array-temp.f90 @@ -404,7 +404,7 @@ subroutine tt1 ! CHECK-NEXT: fir.call @_FortranAioEndIoStatement print*, [(r([7.0]),i=1,3)] contains - ! CHECK-LABEL: func @_QFtt1Pr + ! CHECK-LABEL: func private @_QFtt1Pr function r(x) real x(:) r = x(1) diff --git a/flang/test/Lower/dummy-arguments.f90 b/flang/test/Lower/dummy-arguments.f90 index 43d8e3c1e5d4485..331e089a60fa02c 100644 --- a/flang/test/Lower/dummy-arguments.f90 +++ b/flang/test/Lower/dummy-arguments.f90 @@ -9,7 +9,7 @@ program test1 call foo(10) contains -! CHECK-LABEL: func @_QFPfoo +! CHECK-LABEL: func private @_QFPfoo subroutine foo(avar1) integer :: avar1 ! integer :: my_data, my_data2 diff --git a/flang/test/Lower/dummy-procedure-character.f90 b/flang/test/Lower/dummy-procedure-character.f90 index cecd839287ed133..72d548513fb270b 100644 --- a/flang/test/Lower/dummy-procedure-character.f90 +++ b/flang/test/Lower/dummy-procedure-character.f90 @@ -213,7 +213,7 @@ subroutine host(f) ! CHECK: fir.call @_QFhostPintern(%[[VAL_1]]) call intern() contains -! CHECK-LABEL: func @_QFhostPintern( +! CHECK-LABEL: func private @_QFhostPintern( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) subroutine intern() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 @@ -242,7 +242,7 @@ subroutine host2(f) ! CHECK: fir.call @_QFhost2Pintern(%[[VAL_1]]) call intern() contains -! CHECK-LABEL: func @_QFhost2Pintern( +! CHECK-LABEL: func private @_QFhost2Pintern( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) subroutine intern() ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.char<1,42> {bindc_name = ".result"} diff --git a/flang/test/Lower/equivalence-with-host-assoc.f90 b/flang/test/Lower/equivalence-with-host-assoc.f90 index ec84fb506314bad..0ffb1bc5bf9ee17 100644 --- a/flang/test/Lower/equivalence-with-host-assoc.f90 +++ b/flang/test/Lower/equivalence-with-host-assoc.f90 @@ -10,7 +10,7 @@ subroutine inner i1 = j1 end subroutine inner end subroutine test1 -! FIR-LABEL: func.func @_QFtest1Pinner() attributes {fir.internal_proc} { +! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -24,7 +24,7 @@ end subroutine test1 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func @_QFtest1Pinner() attributes {fir.internal_proc} { +! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -54,7 +54,7 @@ subroutine inner end subroutine inner end subroutine host end module test2 -! FIR-LABEL: func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} { +! FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! 
FIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -68,7 +68,7 @@ end module test2 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func @_QMtest2FhostPinner() attributes {fir.internal_proc} { +! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -94,7 +94,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test3 -! FIR-LABEL: func.func @_QFtest3Pinner() attributes {fir.internal_proc} { +! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -115,7 +115,7 @@ end subroutine test3 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func @_QFtest3Pinner() attributes {fir.internal_proc} { +! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -149,7 +149,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test4 -! FIR-LABEL: func.func @_QFtest4Pinner() attributes {fir.internal_proc} { +! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -170,7 +170,7 @@ end subroutine test4 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func @_QFtest4Pinner() attributes {fir.internal_proc} { +! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90 index 4e4b035bae7e1ea..86aae720e7fcf9c 100644 --- a/flang/test/Lower/explicit-interface-results-2.f90 +++ b/flang/test/Lower/explicit-interface-results-2.f90 @@ -69,8 +69,8 @@ subroutine host4() integer :: n call internal_proc_a() contains -! CHECK-LABEL: func @_QFhost4Pinternal_proc_a -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFhost4Pinternal_proc_a +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 @@ -94,7 +94,7 @@ subroutine host5() implicit none call internal_proc_a() contains -! CHECK-LABEL: func @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc} { +! 
CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref @@ -115,7 +115,7 @@ subroutine host6() implicit none call internal_proc_a() contains -! CHECK-LABEL: func @_QFhost6Pinternal_proc_a +! CHECK-LABEL: func private @_QFhost6Pinternal_proc_a subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref @@ -187,7 +187,7 @@ subroutine host9() common /mycom/ n_common call internal_proc_a() contains -! CHECK-LABEL: func @_QFhost9Pinternal_proc_a +! CHECK-LABEL: func private @_QFhost9Pinternal_proc_a subroutine internal_proc_a() ! CHECK: %[[VAL_0:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_1:.*]] = fir.address_of(@mycom_) : !fir.ref> @@ -213,7 +213,7 @@ subroutine host10() implicit none call internal_proc_a() contains -! CHECK-LABEL: func @_QFhost10Pinternal_proc_a +! CHECK-LABEL: func private @_QFhost10Pinternal_proc_a subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_0:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/forall/array-constructor.f90 b/flang/test/Lower/forall/array-constructor.f90 index 083a71ba479aa2e..ad21ed33fba2d4c 100644 --- a/flang/test/Lower/forall/array-constructor.f90 +++ b/flang/test/Lower/forall/array-constructor.f90 @@ -114,8 +114,8 @@ end subroutine ac1 ! CHECK: return ! CHECK: } -! CHECK-LABEL: func @_QFac1Pfunc( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}) -> i32 { +! CHECK-LABEL: func private @_QFac1Pfunc( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}) -> i32 {{.*}} { ! CHECK: %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "func", uniq_name = "_QFac1FfuncEfunc"} ! CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : i64 @@ -259,8 +259,8 @@ end subroutine ac2 ! CHECK: return ! CHECK: } -! CHECK-LABEL: func @_QFac2Pfunc( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}) -> !fir.array<3xi32> { +! CHECK-LABEL: func private @_QFac2Pfunc( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}) -> !fir.array<3xi32> {{.*}} { ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "func", uniq_name = "_QFac2FfuncEfunc"} ! CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index e5c40a16420a622..e97c3f36b0b10f2 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -17,7 +17,7 @@ subroutine sub(x) end subroutine sub end program test -! CHECK-LABEL: define void @_QFPsub( +! CHECK-LABEL: define internal void @_QFPsub( ! CHECK-SAME: ptr %[[arg:.*]]) ! CHECK: %[[extent:.*]] = getelementptr { {{.*}}, [1 x [3 x i64]] }, ptr %[[arg]], i32 0, i32 7, i64 0, i32 1 ! CHECK: %[[extval:.*]] = load i64, ptr %[[extent]] diff --git a/flang/test/Lower/global-initialization.f90 b/flang/test/Lower/global-initialization.f90 index dd60a6fd8b9f861..ff208ecc3c89781 100644 --- a/flang/test/Lower/global-initialization.f90 +++ b/flang/test/Lower/global-initialization.f90 @@ -4,16 +4,19 @@ program bar ! CHECK: fir.address_of(@[[name1:.*]]my_data) integer, save :: my_data = 1 print *, my_data + call foo() + call foo2() + call foo3() contains -! CHECK-LABEL: func @_QFPfoo +! 
CHECK-LABEL: func private @_QFPfoo subroutine foo() ! CHECK: fir.address_of(@[[name2:.*foo.*my_data]]) integer, save :: my_data = 2 print *, my_data + 1 end subroutine -! CHECK-LABEL: func @_QFPfoo2 +! CHECK-LABEL: func private @_QFPfoo2 subroutine foo2() ! CHECK: fir.address_of(@[[name3:.*foo2.*my_data]]) integer, save :: my_data @@ -21,7 +24,7 @@ subroutine foo2() print *, my_data end subroutine -! CHECK-LABEL: func @_QFPfoo3 +! CHECK-LABEL: func private @_QFPfoo3 subroutine foo3() ! CHECK-DAG: fir.address_of(@[[name4:.*foo3.*idata]]){{.*}}fir.array<5xi32> ! CHECK-DAG: fir.address_of(@[[name5:.*foo3.*rdata]]){{.*}}fir.array<3xf16> diff --git a/flang/test/Lower/host-associated-functions.f90 b/flang/test/Lower/host-associated-functions.f90 index 77a51490950fa11..78d081748c2f42f 100644 --- a/flang/test/Lower/host-associated-functions.f90 +++ b/flang/test/Lower/host-associated-functions.f90 @@ -19,8 +19,8 @@ subroutine capture_char_func_dummy(char_func_dummy, n) ! CHECK: fir.call @_QFcapture_char_func_dummyPinternal(%[[VAL_2]]) {{.*}}: (!fir.ref ()>, i64>, !fir.ref>>) -> () call internal() contains - ! CHECK-LABEL: func @_QFcapture_char_func_dummyPinternal( - ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFcapture_char_func_dummyPinternal( + ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>, !fir.ref>>, i32) -> !fir.ref ()>, i64>> @@ -55,8 +55,8 @@ subroutine capture_char_func_assumed_dummy(char_func_dummy) ! CHECK: fir.call @_QFcapture_char_func_assumed_dummyPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref ()>, i64>>>) -> () call internal() contains -! CHECK-LABEL: func @_QFcapture_char_func_assumed_dummyPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFcapture_char_func_assumed_dummyPinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>>>, i32) -> !fir.ref ()>, i64>> @@ -84,7 +84,7 @@ subroutine capture_char_func(n) ! CHECK: fir.call @_QFcapture_char_funcPinternal(%[[VAL_1]]) {{.*}}: (!fir.ref>>) -> () call internal() contains -! CHECK-LABEL: func @_QFcapture_char_funcPinternal( +! CHECK-LABEL: func private @_QFcapture_char_funcPinternal( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) subroutine internal() print *, char_func() @@ -109,8 +109,8 @@ function array_func() call internal() contains subroutine internal() -! CHECK-LABEL: func @_QFcapture_array_funcPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFcapture_array_funcPinternal( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.llvm_ptr> @@ -146,7 +146,7 @@ subroutine use_module() ! 
CHECK: fir.call @_QFuse_modulePinternal() {{.*}}: () -> () call internal() contains -! CHECK-LABEL: func @_QFuse_modulePinternal() { +! CHECK-LABEL: func private @_QFuse_modulePinternal() {{.*}} { subroutine internal() print *, return_char(42) end subroutine diff --git a/flang/test/Lower/host-associated-globals.f90 b/flang/test/Lower/host-associated-globals.f90 index bb22a32775427d1..fe612e777aeaad5 100644 --- a/flang/test/Lower/host-associated-globals.f90 +++ b/flang/test/Lower/host-associated-globals.f90 @@ -18,7 +18,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func @_QFmodule_varPbar() +! CHECK-LABEL: func.func private @_QFmodule_varPbar() ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMtest_mod_used_in_hostEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>, index) -> !fir.ref @@ -37,7 +37,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func @_QFtest_commonPbar() attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@x_) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index @@ -59,7 +59,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func @_QFsaved_equivPbar() attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFsaved_equivEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>, index) -> !fir.ref @@ -79,8 +79,8 @@ subroutine bar() call test(saved_j, j) end subroutine end subroutine -! CHECK-LABEL: func.func @_QFmixed_capturePbar( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QFmixed_capturePbar( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFmixed_captureEsaved_i) : !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.ref diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90 index 25e637805e872bc..f88903c8af80f8f 100644 --- a/flang/test/Lower/host-associated.f90 +++ b/flang/test/Lower/host-associated.f90 @@ -19,8 +19,8 @@ subroutine test1 call test1_internal print *, i contains - ! CHECK-LABEL: func @_QFtest1Ptest1_internal( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest1Ptest1_internal( + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[iaddr:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[i:.*]] = fir.load %[[iaddr]] : !fir.llvm_ptr> ! CHECK: %[[val:.*]] = fir.call @_QPifoo() {{.*}}: () -> i32 @@ -46,8 +46,8 @@ subroutine test2 call test2_internal print *, a, b contains - ! CHECK-LABEL: func @_QFtest2Ptest2_internal( - ! 
CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest2Ptest2_internal( + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test2_internal ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -61,8 +61,8 @@ subroutine test2_internal call test2_inner end subroutine test2_internal - ! CHECK-LABEL: func @_QFtest2Ptest2_inner( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest2Ptest2_inner( + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test2_inner ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -95,8 +95,8 @@ subroutine test6(c) print *, c contains - ! CHECK-LABEL: func @_QFtest6Ptest6_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest6Ptest6_inner( + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test6_inner ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[load:.*]] = fir.load %[[coor]] : !fir.ref> @@ -137,8 +137,8 @@ subroutine test3(p,q,i) end if contains - ! CHECK-LABEL: func @_QFtest3Ptest3_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest3Ptest3_inner( + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test3_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -184,8 +184,8 @@ subroutine test3a(p) end if contains - ! CHECK: func @_QFtest3aPtest3a_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK: func private @_QFtest3aPtest3a_inner( + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test3a_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -228,8 +228,8 @@ subroutine test4 end if contains - ! CHECK-LABEL: func @_QFtest4Ptest4_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest4Ptest4_inner( + ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test4_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, !fir.ref>>>>, i32) -> !fir.llvm_ptr>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>> @@ -270,8 +270,8 @@ subroutine test5 end if contains - ! CHECK-LABEL: func @_QFtest5Ptest5_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc} { + ! CHECK-LABEL: func private @_QFtest5Ptest5_inner( + ! 
CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine test5_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>, !fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>>> @@ -308,8 +308,8 @@ subroutine test7(j, k) k = test7_inner(k) contains -! CHECK-LABEL: func @_QFtest7Ptest7_inner( -! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFtest7Ptest7_inner( +! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { elemental integer function test7_inner(i) implicit none integer, intent(in) :: i @@ -329,8 +329,8 @@ subroutine issue990() integer :: captured call bar() contains -! CHECK-LABEL: func @_QFissue990Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFissue990Pbar( +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine bar() integer :: stmt_func, i stmt_func(i) = i + captured @@ -351,8 +351,8 @@ subroutine issue990b() captured_stmt_func(i) = i + captured call bar() contains -! CHECK-LABEL: func @_QFissue990bPbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFissue990bPbar( +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[addr:.*]] = fir.load %[[tupAddr]] : !fir.llvm_ptr> @@ -372,8 +372,8 @@ real function dummy_proc(x) end interface call bar() contains -! CHECK-LABEL: func @_QFtest8Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFtest8Pbar( +! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -392,8 +392,8 @@ subroutine dummy_proc() end interface call bar() contains -! CHECK-LABEL: func @_QFtest9Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFtest9Pbar( +! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -415,8 +415,8 @@ subroutine test10(i) ! CHECK: fir.call @_QFtest10Pbar(%[[tup]]) {{.*}}: (!fir.ref>>>>>) -> () call bar() contains -! CHECK-LABEL: func @_QFtest10Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func private @_QFtest10Pbar( +! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { subroutine bar() ! 
CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: fir.load %[[tupAddr]] : !fir.llvm_ptr>>>> @@ -433,9 +433,9 @@ subroutine bar() ! CHECK: %[[VAL_8:.*]] = fir.emboxproc %[[VAL_7]], %[[VAL_5]] : ((!fir.ref, !fir.ref>>) -> (), !fir.ref>>) -> !fir.boxproc<() -> ()> ! CHECK: fir.call @_QPtest_proc_dummy_other(%[[VAL_8]]) {{.*}}: (!fir.boxproc<() -> ()>) -> () -! CHECK-LABEL: func @_QFtest_proc_dummyPtest_proc_dummy_a( +! CHECK-LABEL: func private @_QFtest_proc_dummyPtest_proc_dummy_a( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "j"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.llvm_ptr> @@ -525,10 +525,10 @@ end subroutine test_proc_dummy_other ! CHECK: return ! CHECK: } -! CHECK-LABEL: func @_QFtest_proc_dummy_charPgen_message( +! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, ! CHECK-SAME: %[[VAL_1:.*]]: index, -! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc} { +! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false diff --git a/flang/test/Lower/module-and-internal-proc.f90 b/flang/test/Lower/module-and-internal-proc.f90 index 1da5ce422939741..0f4c6809581cfc4 100644 --- a/flang/test/Lower/module-and-internal-proc.f90 +++ b/flang/test/Lower/module-and-internal-proc.f90 @@ -17,7 +17,7 @@ subroutine test1() subroutine test2() call test2internal() contains - ! CHECK-LABEL: func @_QMparentFtest2Ptest2internal() + ! CHECK-LABEL: func private @_QMparentFtest2Ptest2internal() subroutine test2internal() ! CHECK: fir.address_of(@_QMparentEi) : !fir.ref print *, i @@ -31,7 +31,7 @@ subroutine test3() use parent call test3internal() contains - ! CHECK-LABEL: func @_QFtest3Ptest3internal() + ! CHECK-LABEL: func private @_QFtest3Ptest3internal() subroutine test3internal() ! CHECK: fir.address_of(@_QMparentEi) : !fir.ref print *, i diff --git a/flang/test/Lower/parent-component.f90 b/flang/test/Lower/parent-component.f90 index ed1130a08493e22..c6bc53340643fca 100644 --- a/flang/test/Lower/parent-component.f90 +++ b/flang/test/Lower/parent-component.f90 @@ -29,20 +29,20 @@ subroutine print_scalar(a) type(p), intent(in) :: a print*, a end subroutine - ! CHECK-LABEL: func.func @_QFPprint_scalar(%{{.*}}: !fir.ref> {fir.bindc_name = "a"}) + ! CHECK-LABEL: func.func private @_QFPprint_scalar(%{{.*}}: !fir.ref> {fir.bindc_name = "a"}) subroutine print_p(a) type(p), intent(in) :: a(2) print*, a end subroutine - ! CHECK-LABEL: func.func @_QFPprint_p(%{{.*}}: !fir.ref>> {fir.bindc_name = "a"}) + ! CHECK-LABEL: func.func private @_QFPprint_p(%{{.*}}: !fir.ref>> {fir.bindc_name = "a"}) subroutine init_with_slice() type(c) :: y(2) = [ c(11, 21), c(12, 22) ] call print_p(y(:)%p) print*,y(:)%p end subroutine - ! CHECK-LABEL: func.func @_QFPinit_with_slice() + ! CHECK-LABEL: func.func private @_QFPinit_with_slice() ! 
CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_with_sliceEy) : !fir.ref>> ! CHECK: %[[C2:.*]] = arith.constant 2 : index ! CHECK: %[[C1:.*]] = arith.constant 1 : index @@ -78,7 +78,7 @@ subroutine init_no_slice() call print_p(y%p) print*,y%p end subroutine - ! CHECK-LABEL: func.func @_QFPinit_no_slice() + ! CHECK-LABEL: func.func private @_QFPinit_no_slice() ! CHECK: %[[Y:.*]] = fir.address_of(@_QFFinit_no_sliceEy) : !fir.ref>> ! CHECK: %[[C2:.*]] = arith.constant 2 : index ! CHECK: %[[SHAPE:.*]] = fir.shape %[[C2]] : (index) -> !fir.shape<1> @@ -106,7 +106,7 @@ subroutine init_allocatable() print*,y%p end subroutine - ! CHECK-LABEL: func.func @_QFPinit_allocatable() + ! CHECK-LABEL: func.func private @_QFPinit_allocatable() ! CHECK: %[[ALLOC:.*]] = fir.alloca !fir.heap>> {uniq_name = "_QFFinit_allocatableEy.addr"} ! CHECK: %[[LB0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.lb0"} ! CHECK: %[[EXT0:.*]] = fir.alloca index {uniq_name = "_QFFinit_allocatableEy.ext0"} @@ -139,7 +139,7 @@ subroutine init_scalar() print*,s%p end subroutine - ! CHECK-LABEL: func.func @_QFPinit_scalar() + ! CHECK-LABEL: func.func private @_QFPinit_scalar() ! CHECK: %[[S:.*]] = fir.address_of(@_QFFinit_scalarEs) : !fir.ref> ! CHECK: %[[CAST:.*]] = fir.convert %[[S]] : (!fir.ref>) -> !fir.ref> ! CHECK: fir.call @_QFPprint_scalar(%[[CAST]]) {{.*}}: (!fir.ref>) -> () @@ -154,7 +154,7 @@ subroutine init_assumed(y) print*,y%p end subroutine - ! CHECK-LABEL: func.func @_QFPinit_assumed( + ! CHECK-LABEL: func.func private @_QFPinit_assumed( ! CHECK-SAME: %[[ARG0:.*]]: !fir.box> ! CHECK: %[[BOX:.*]] = fir.rebox %[[ARG0]] : (!fir.box>>) -> !fir.box>> @@ -167,7 +167,7 @@ subroutine init_existing_field() call print_p(y%c%p) end subroutine - ! CHECK-LABEL: func.func @_QFPinit_existing_field + ! CHECK-LABEL: func.func private @_QFPinit_existing_field ! CHECK: %[[C2:.*]] = arith.constant 2 : index ! CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.array<2x!fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}>> {bindc_name = "y", uniq_name = "_QFFinit_existing_fieldEy"} ! CHECK: %[[FIELD_C:.*]] = fir.field_index c, !fir.type<_QFTz{k:i32,c:!fir.type<_QFTc{a:i32,b:i32}>}> @@ -183,7 +183,7 @@ subroutine parent_comp_lhs() a%p = B end subroutine -! CHECK-LABEL: func.func @_QFPparent_comp_lhs() +! CHECK-LABEL: func.func private @_QFPparent_comp_lhs() ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box> ! CHECK: %[[A:.*]] = fir.alloca !fir.type<_QFTc{a:i32,b:i32}> {bindc_name = "a", uniq_name = "_QFFparent_comp_lhsEa"} ! CHECK: %[[B:.*]] = fir.alloca !fir.type<_QFTp{a:i32}> {bindc_name = "b", uniq_name = "_QFFparent_comp_lhsEb"} diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index a813eff690b77e8..15d8a86e4ef47cc 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -519,8 +519,8 @@ subroutine internal end subroutine end subroutine -! CHECK-LABEL: func.func @_QMpolymorphic_testFhost_assocPinternal( -! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc} { +! CHECK-LABEL: func.func private @_QMpolymorphic_testFhost_assocPinternal( +! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { ! CHECK: %[[POS_IN_TUPLE:.*]] = arith.constant 0 : i32 ! CHECK: %[[COORD_OF_CLASS:.*]] = fir.coordinate_of %[[TUPLE]], %[[POS_IN_TUPLE]] : (!fir.ref>>>, i32) -> !fir.ref>> ! 
CHECK: %[[CLASS:.*]] = fir.load %[[COORD_OF_CLASS]] : !fir.ref>> diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90 index 36631979141a087..002343c45f6ec73 100644 --- a/flang/test/Lower/program-units-fir-mangling.f90 +++ b/flang/test/Lower/program-units-fir-mangling.f90 @@ -44,12 +44,12 @@ function foo() function foo2() real(4) :: foo2 contains - ! CHECK-LABEL: func @_QFfoo2Psub() { + ! CHECK-LABEL: func private @_QFfoo2Psub() {{.*}} { subroutine sub() ! CHECK: } end subroutine - ! CHECK-LABEL: func @_QFfoo2Pfoo() { + ! CHECK-LABEL: func private @_QFfoo2Pfoo() {{.*}} { subroutine foo() ! CHECK: } end subroutine @@ -58,12 +58,12 @@ subroutine foo() ! CHECK-LABEL: func @_QPsub2() subroutine sUb2() contains - ! CHECK-LABEL: func @_QFsub2Psub() { + ! CHECK-LABEL: func private @_QFsub2Psub() {{.*}} { subroutine sub() ! CHECK: } end subroutine - ! CHECK-LABEL: func @_QFsub2Pfoo() { + ! CHECK-LABEL: func private @_QFsub2Pfoo() {{.*}} { subroutine Foo() ! CHECK: } end subroutine @@ -74,7 +74,7 @@ module testMod2 ! CHECK-LABEL: func @_QMtestmod2Psub() subroutine sub() contains - ! CHECK-LABEL: func @_QMtestmod2FsubPsubsub() { + ! CHECK-LABEL: func private @_QMtestmod2FsubPsubsub() {{.*}} { subroutine subSub() ! CHECK: } end subroutine @@ -105,7 +105,7 @@ subroutine sub ! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplPfoo() subroutine foo contains - ! CHECK-LABEL: func @_QMcolor_pointsScolor_points_aSimplFfooPbar() { + ! CHECK-LABEL: func private @_QMcolor_pointsScolor_points_aSimplFfooPbar() {{.*}} { subroutine bar ! CHECK: } end subroutine @@ -128,7 +128,7 @@ subroutine should_not_collide() program test ! CHECK: } contains -! CHECK-LABEL: func @_QFPshould_not_collide() { +! CHECK-LABEL: func private @_QFPshould_not_collide() {{.*}} { subroutine should_not_collide() ! CHECK: } end subroutine @@ -226,7 +226,7 @@ subroutine nest1 ! CHECK: fir.call @_QFnest1Pinner() call inner contains - ! CHECK-LABEL: func @_QFnest1Pinner + ! CHECK-LABEL: func private @_QFnest1Pinner subroutine inner ! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QFnest1FinnerEkk) : !fir.ref integer, save :: kk = 1 @@ -239,7 +239,7 @@ subroutine nest2 ! CHECK: fir.call @_QFnest2Pinner() call inner contains - ! CHECK-LABEL: func @_QFnest2Pinner + ! CHECK-LABEL: func private @_QFnest2Pinner subroutine inner ! CHECK: %[[V_0:[0-9]+]] = fir.address_of(@_QFnest2FinnerEkk) : !fir.ref integer, save :: kk = 77 From e39e30e95237bee53346acf16d197de8fdf4825e Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Wed, 28 Feb 2024 14:45:18 +0100 Subject: [PATCH 073/114] [mlir][llvm] Fix access group translation (#83257) This commit fixes the translation of access group metadata to LLVM IR. Previously, it did not use a temporary metadata node as a placeholder for the self-referencing access group nodes. This is dangerous since the translation may produce a metadata list with a null entry that is later on replaced with a self-reference. At the same time, the debug info translation, for example, may create the same uniqued node, which, after the self-reference is set, suddenly references the access group metadata. The commit avoids such breakages.
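For illustration, the placeholder follows the usual LLVM idiom for building self-referential metadata safely (a minimal sketch of the pattern, as in llvm::MDBuilder::createAnonymousAARoot; the names are illustrative, not the exact code in ModuleTranslation.cpp):

    // Temporary nodes are never uniqued, so the placeholder cannot collide
    // with unrelated metadata such as debug info type lists.
    auto Placeholder = llvm::MDNode::getTemporary(Ctx, std::nullopt);
    llvm::Metadata *Ops[] = {Placeholder.get()}; // no null entry in the list
    llvm::MDNode *Domain = llvm::MDNode::get(Ctx, Ops);
    Domain->replaceOperandWith(0, Domain); // install the real self-reference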
--- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 5 ++- .../Target/LLVMIR/attribute-alias-scopes.mlir | 44 ++++++++++++++++--- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index a11603a44dcd1c6..c00628a420a000a 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -1512,13 +1512,14 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) { if (!scopeInserted) return scopeIt->second; llvm::LLVMContext &ctx = llvmModule->getContext(); + auto dummy = llvm::MDNode::getTemporary(ctx, std::nullopt); // Convert the domain metadata node if necessary. auto [domainIt, insertedDomain] = aliasDomainMetadataMapping.try_emplace( aliasScopeAttr.getDomain(), nullptr); if (insertedDomain) { llvm::SmallVector operands; // Placeholder for self-reference. - operands.push_back({}); + operands.push_back(dummy.get()); if (StringAttr description = aliasScopeAttr.getDomain().getDescription()) operands.push_back(llvm::MDString::get(ctx, description)); domainIt->second = llvm::MDNode::get(ctx, operands); @@ -1529,7 +1530,7 @@ ModuleTranslation::getOrCreateAliasScope(AliasScopeAttr aliasScopeAttr) { assert(domainIt->second && "Scope's domain should already be valid"); llvm::SmallVector operands; // Placeholder for self-reference. - operands.push_back({}); + operands.push_back(dummy.get()); operands.push_back(domainIt->second); if (StringAttr description = aliasScopeAttr.getDescription()) operands.push_back(llvm::MDString::get(ctx, description)); diff --git a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir index 4434aea4ec965f9..fa3395533af2204 100644 --- a/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir +++ b/mlir/test/Target/LLVMIR/attribute-alias-scopes.mlir @@ -59,14 +59,48 @@ llvm.func @alias_scopes(%arg1 : !llvm.ptr) { #alias_scope1 = #llvm.alias_scope, domain = #alias_scope_domain> // CHECK-LABEL: @noalias_intr_only -llvm.func @noalias_intr_only(%arg1 : !llvm.ptr) { - %0 = llvm.mlir.constant(0 : i32) : i32 - // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES1:[0-9]+]]) +llvm.func @noalias_intr_only() { + // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]]) llvm.intr.experimental.noalias.scope.decl #alias_scope1 llvm.return } // Check the translated metadata. // CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]], !"The domain"} -// CHECK-DAG: ![[SCOPE1:[0-9]+]] = distinct !{![[SCOPE1]], ![[DOMAIN]]} -// CHECK-DAG: ![[SCOPES1]] = !{![[SCOPE1]]} +// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]} +// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]} + +// ----- + +// This test ensures the alias scope translation creates a temporary metadata +// node as a placeholder for self-references. Without this, the debug info +// translation of a type list with a null entry could inadvertently reference +// access group metadata. This occurs when both translations generate a metadata +// list with a null entry, which are then uniqued to the same metadata node. +// The access group translation subsequently updates the null entry to a +// self-reference, which causes the type list to reference the access +// group node as well. The use of a temporary placeholder node avoids the issue. 
+ +#alias_scope_domain = #llvm.alias_scope_domain> +#alias_scope = #llvm.alias_scope, domain = #alias_scope_domain> + +#di_null_type = #llvm.di_null_type +#di_subroutine_type = #llvm.di_subroutine_type +#di_file = #llvm.di_file<"attribute-alias-scope.mlir" in ""> +#di_compile_unit = #llvm.di_compile_unit, sourceLanguage = DW_LANG_C11, file = #di_file, isOptimized = true, emissionKind = Full> +#di_subprogram = #llvm.di_subprogram, compileUnit = #di_compile_unit, scope = #di_file, file = #di_file, subprogramFlags = "Definition", type = #di_subroutine_type> + +// CHECK-LABEL: @self_reference +llvm.func @self_reference() { + // CHECK: call void @llvm.experimental.noalias.scope.decl(metadata ![[SCOPES:[0-9]+]]) + llvm.intr.experimental.noalias.scope.decl #alias_scope + llvm.return +} loc(fused<#di_subprogram>[unknown]) + +// Check that the translated subroutine types do not reference the access group +// domain since both of them are created as metadata list with a null entry. +// CHECK-DAG: ![[DOMAIN:[0-9]+]] = distinct !{![[DOMAIN]]} +// CHECK-DAG: ![[SCOPE:[0-9]+]] = distinct !{![[SCOPE]], ![[DOMAIN]]} +// CHECK-DAG: ![[SCOPES]] = !{![[SCOPE]]} +// CHECK-DAG: = !DISubroutineType(types: ![[TYPES:[0-9]+]]) +// CHECK-DAG: ![[TYPES]] = !{null} From ce0687e2df59ff6681c5800f076716f4665c5ec3 Mon Sep 17 00:00:00 2001 From: Niwin Anto Date: Wed, 28 Feb 2024 14:46:00 +0100 Subject: [PATCH 074/114] [LV] Add test for tail fold by masking with external IV users. (#82329) Test case for https://github.com/llvm/llvm-project/issues/76069 --- ...o-fold-tail-by-masking-iv-external-uses.ll | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll diff --git a/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll new file mode 100644 index 000000000000000..9968c933494aa03 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/no-fold-tail-by-masking-iv-external-uses.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s + +; FIXME: The vectorizer should refuse to fold the tail by masking because +; %conv is used outside of the loop. Test for this by checking that +; %n.vec, the vector trip count, is rounded down to the next multiple of +; 4. If folding the tail, it would have been rounded up instead. +; Test case for #76069(https://github.com/llvm/llvm-project/issues/76069). 
+define i32 @test(ptr %arr, i64 %n) { +; CHECK-LABEL: define i32 @test( +; CHECK-SAME: ptr [[ARR:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[N]], 1 +; CHECK-NEXT: br i1 [[CMP1]], label [[PREHEADER:%.*]], label [[DONE:%.*]] +; CHECK: preheader: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] +; CHECK: vector.scevcheck: +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], -2 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = add i8 1, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[TMP1]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP1]] to i8 +; CHECK-NEXT: [[TMP8:%.*]] = add i8 2, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult i8 [[TMP8]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP1]], 255 +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP6]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP0]], 3 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 1, [[N_VEC]] +; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8 +; CHECK-NEXT: [[IND_END1:%.*]] = add i8 1, [[DOTCAST]] +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP0]], 1 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE10]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 3 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], +; CHECK-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP17]], i32 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK: pred.store.if: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP20]] +; CHECK-NEXT: store i32 65, ptr [[TMP21]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] +; CHECK: pred.store.continue: +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP17]], i32 1 +; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; 
CHECK: pred.store.if5: +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP23]] +; CHECK-NEXT: store i32 65, ptr [[TMP24]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP17]], i32 2 +; CHECK-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] +; CHECK: pred.store.if7: +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP26]] +; CHECK-NEXT: store i32 65, ptr [[TMP27]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP17]], i32 3 +; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.if9: +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[TMP29]] +; CHECK-NEXT: store i32 65, ptr [[TMP30]], align 4 +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] +; CHECK: pred.store.continue10: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMO:%.*]] = sub i64 [[N_VEC]], 1 +; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i64 1, [[CMO]] +; CHECK-NEXT: br i1 true, label [[LOAD_VAL:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ], [ 1, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[CONV:%.*]] = phi i64 [ [[CONV2:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I:%.*]] = phi i8 [ [[INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUB:%.*]] = add nsw i64 [[CONV]], -1 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[SUB]] +; CHECK-NEXT: store i32 65, ptr [[PTR]], align 4 +; CHECK-NEXT: [[INC]] = add i8 [[I]], 1 +; CHECK-NEXT: [[CONV2]] = zext i8 [[INC]] to i64 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[CONV2]], [[N]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP]], label [[LOAD_VAL]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: load_val: +; CHECK-NEXT: [[FINAL:%.*]] = phi i64 [ [[CONV]], [[LOOP]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[FINAL]] +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4 +; CHECK-NEXT: br label [[DONE]] +; CHECK: done: +; CHECK-NEXT: [[VALUE:%.*]] = phi i32 [ [[VAL]], [[LOAD_VAL]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i32 [[VALUE]] +; +entry: + %cmp1 = icmp ugt i64 %n, 1 + br i1 %cmp1, label %preheader, label %done + +preheader: + br label %loop + +loop: + %conv = phi i64 [ %conv2, %loop ], [ 1, %preheader ] + %i = phi i8 [ %inc, %loop ], [ 1, %preheader ] + %sub = add nsw i64 %conv, -1 + %ptr = getelementptr inbounds i32, ptr %arr, i64 %sub + store i32 65, ptr %ptr, align 4 + %inc = 
add i8 %i, 1 + %conv2 = zext i8 %inc to i64 + %cmp2 = icmp ult i64 %conv2, %n + br i1 %cmp2, label %loop, label %load_val, !llvm.loop !0 + +load_val: + %final = phi i64 [ %conv, %loop ] + %ptr2 = getelementptr inbounds i32, ptr %arr, i64 %final + %val = load i32, ptr %ptr2, align 4 + br label %done + +done: + %value = phi i32 [ %val, %load_val ], [ 0, %entry ] + ret i32 %value + +} + +!0 = distinct !{!0, !1, !2, !3} +!1 = !{!"llvm.loop.unroll.disable"} +!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[META2]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META3]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. From a845ea3878f18878b6bbc91ff5fee2dd51a794f3 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Wed, 28 Feb 2024 14:47:33 +0100 Subject: [PATCH 075/114] [AMDGPU] Fix SDWA 'preserve' transformation for instructions in different basic blocks. (#82406) This fixes a crash when operand sources for a V_OR instruction reside in different basic blocks. --- llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 7 ++- llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir | 57 ++++++++++++++++++++++ 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 53fc2c0686245f9..afc380b42034579 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -472,12 +472,11 @@ bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, } // Move MI before v_or_b32 - auto MBB = MI.getParent(); - MBB->remove(&MI); - MBB->insert(getParentInst(), &MI); + MI.getParent()->remove(&MI); + getParentInst()->getParent()->insert(getParentInst(), &MI); // Add Implicit use of preserved register - MachineInstrBuilder MIB(*MBB->getParent(), MI); + MachineInstrBuilder MIB(*MI.getMF(), MI); MIB.addReg(getPreservedOperand()->getReg(), RegState::ImplicitKill, getPreservedOperand()->getSubReg()); diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir index f93456ccacb806a..4c61e6803febf35 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sdwa-preserve.mir @@ -160,3 +160,60 @@ body: | S_ENDPGM 0 ...
+--- +name: add_f16_u32_preserve_different_bb +tracksRegLiveness: true +body: | + ; SDWA-LABEL: name: add_f16_u32_preserve_different_bb + ; SDWA: bb.0: + ; SDWA-NEXT: successors: %bb.1(0x80000000) + ; SDWA-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31 + ; SDWA-NEXT: {{ $}} + ; SDWA-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr30_sgpr31 + ; SDWA-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; SDWA-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; SDWA-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + ; SDWA-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + ; SDWA-NEXT: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 65535, [[FLAT_LOAD_DWORD]], implicit $exec + ; SDWA-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[FLAT_LOAD_DWORD1]], implicit $exec + ; SDWA-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[FLAT_LOAD_DWORD]], 8, 8, implicit $exec + ; SDWA-NEXT: [[V_LSHRREV_B32_e32_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e32 24, [[FLAT_LOAD_DWORD1]], implicit $exec + ; SDWA-NEXT: {{ $}} + ; SDWA-NEXT: bb.1: + ; SDWA-NEXT: successors: %bb.2(0x80000000) + ; SDWA-NEXT: {{ $}} + ; SDWA-NEXT: [[V_MUL_F32_sdwa:%[0-9]+]]:vgpr_32 = V_MUL_F32_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 5, 0, 1, 3, implicit $mode, implicit $exec + ; SDWA-NEXT: {{ $}} + ; SDWA-NEXT: bb.2: + ; SDWA-NEXT: [[V_ADD_F16_sdwa:%[0-9]+]]:vgpr_32 = V_ADD_F16_sdwa 0, [[FLAT_LOAD_DWORD]], 0, [[FLAT_LOAD_DWORD1]], 0, 0, 1, 2, 4, 5, implicit $mode, implicit $exec, implicit killed [[V_MUL_F32_sdwa]](tied-def 0) + ; SDWA-NEXT: FLAT_STORE_DWORD [[COPY2]], [[V_ADD_F16_sdwa]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + ; SDWA-NEXT: $sgpr30_sgpr31 = COPY [[COPY]] + ; SDWA-NEXT: S_SETPC_B64_return $sgpr30_sgpr31 + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31 + + %2:sreg_64 = COPY $sgpr30_sgpr31 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %3:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + %4:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32)) + + %5:vgpr_32 = V_AND_B32_e32 65535, %3, implicit $exec + %6:vgpr_32 = V_LSHRREV_B32_e64 16, %4, implicit $exec + %7:vgpr_32 = V_BFE_U32_e64 %3, 8, 8, implicit $exec + %8:vgpr_32 = V_LSHRREV_B32_e32 24, %4, implicit $exec + + %9:vgpr_32 = V_ADD_F16_e64 0, %5, 0, %6, 0, 0, implicit $mode, implicit $exec + %10:vgpr_32 = V_LSHLREV_B16_e64 8, %9, implicit $exec + + bb.1: + %11:vgpr_32 = V_MUL_F32_e64 0, %7, 0, %8, 0, 0, implicit $mode, implicit $exec + %12:vgpr_32 = V_LSHLREV_B32_e64 16, %11, implicit $exec + + bb.2: + %13:vgpr_32 = V_OR_B32_e64 %10, %12, implicit $exec + + FLAT_STORE_DWORD %0, %13, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32)) + $sgpr30_sgpr31 = COPY %2 + S_SETPC_B64_return $sgpr30_sgpr31 +... From 5f2097dbeda0dfb21bc9dec27f4c8ff2ad42cef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ingo=20M=C3=BCller?= Date: Wed, 28 Feb 2024 14:48:40 +0100 Subject: [PATCH 076/114] [mlir] Expose MLIR_CUDA_CONVERSIONS_ENABLED in mlir-config.h. (#83004) That macro was not defined in some cases and thus yielded warnings if compiled with `-Wundef`. In particular, it was not defined in the BUILD files, so the GPU targets were broken when built with Bazel.
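To illustrate the failure mode (a sketch, not code from this patch): call sites guard CUDA-specific code with a plain `#if`, which `-Wundef` flags whenever the macro is not defined at all. Generating the macro with `#cmakedefine01` guarantees it is always defined, to either 0 or 1:

    #include "mlir/Config/mlir-config.h" // always defines MLIR_ENABLE_CUDA_CONVERSIONS

    #if MLIR_ENABLE_CUDA_CONVERSIONS // no -Wundef warning: the macro is 0 or 1, never undefined
      gpu::registerGPUToNVVMPipeline();
    #endif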
This commit exposes the mentioned CMake variable through mlir-config.h and uses the newly introduced macro of the same name. This replaces the macro MLIR_CUDA_CONVERSIONS_ENABLED, which the CMake files previously defined manually. --- mlir/CMakeLists.txt | 2 -- mlir/include/mlir/Config/mlir-config.h.cmake | 4 ++++ mlir/include/mlir/InitAllPasses.h | 3 ++- .../GPU/Pipelines/GPUToNVVMPipeline.cpp | 5 +++-- .../Dialect/GPU/Transforms/ModuleToBinary.cpp | 3 ++- mlir/lib/Target/LLVM/NVVM/Target.cpp | 9 +++++---- .../Target/LLVM/SerializeNVVMTarget.cpp | 7 ++++--- .../llvm-project-overlay/mlir/BUILD.bazel | 11 ++++++++--- .../mlir/test/BUILD.bazel | 19 +------------------ 9 files changed, 29 insertions(+), 34 deletions(-) diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 16c898bdeb6e003..070609c94a3b384 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -111,8 +111,6 @@ if ("NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) else() set(MLIR_ENABLE_CUDA_CONVERSIONS 0) endif() -# TODO: we should use a config.h file like LLVM does -add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_ENABLE_CUDA_CONVERSIONS}) # Build the ROCm conversions and run according tests if the AMDGPU backend # is available. diff --git a/mlir/include/mlir/Config/mlir-config.h.cmake b/mlir/include/mlir/Config/mlir-config.h.cmake index e152a36c0ce0cf3..4a7d75e22668237 100644 --- a/mlir/include/mlir/Config/mlir-config.h.cmake +++ b/mlir/include/mlir/Config/mlir-config.h.cmake @@ -29,4 +29,8 @@ /* If set, enables PDL usage. */ #cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH +/* If set, enables CUDA-related features in CUDA-related transforms, pipelines, + and targets. */ +#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS + #endif diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index e28921619fe582b..5d90c197a6ccedc 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -14,6 +14,7 @@ #ifndef MLIR_INITALLPASSES_H_ #define MLIR_INITALLPASSES_H_ +#include "mlir/Config/mlir-config.h" #include "mlir/Conversion/Passes.h" #include "mlir/Dialect/AMDGPU/Transforms/Passes.h" #include "mlir/Dialect/Affine/Passes.h" @@ -96,7 +97,7 @@ inline void registerAllPasses() { bufferization::registerBufferizationPipelines(); sparse_tensor::registerSparseTensorPipelines(); tosa::registerTosaToLinalgPipelines(); -#if MLIR_CUDA_CONVERSIONS_ENABLED +#if MLIR_ENABLE_CUDA_CONVERSIONS gpu::registerGPUToNVVMPipeline(); #endif } diff --git a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp index 935f0deaf9c8a69..db1974ddb3773be 100644 --- a/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp +++ b/mlir/lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Config/mlir-config.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" @@ -38,7 +39,7 @@ using namespace mlir; -#if MLIR_CUDA_CONVERSIONS_ENABLED +#if MLIR_ENABLE_CUDA_CONVERSIONS namespace { //===----------------------------------------------------------------------===// @@ -127,4 +128,4 @@ void mlir::gpu::registerGPUToNVVMPipeline() { buildLowerToNVVMPassPipeline); } -#endif // MLIR_CUDA_CONVERSIONS_ENABLED +#endif // MLIR_ENABLE_CUDA_CONVERSIONS diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp index 0527073da85b695..f379ea8193923d6 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Config/mlir-config.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -48,7 +49,7 @@ void GpuModuleToBinaryPass::getDependentDialects( // Register all GPU related translations. registry.insert(); registry.insert(); -#if MLIR_CUDA_CONVERSIONS_ENABLED == 1 +#if MLIR_ENABLE_CUDA_CONVERSIONS registry.insert(); #endif #if MLIR_ROCM_CONVERSIONS_ENABLED == 1 diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp b/mlir/lib/Target/LLVM/NVVM/Target.cpp index 71b15a92782ed7b..d5b6645631edd65 100644 --- a/mlir/lib/Target/LLVM/NVVM/Target.cpp +++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp @@ -13,6 +13,7 @@ #include "mlir/Target/LLVM/NVVM/Target.h" +#include "mlir/Config/mlir-config.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Target/LLVM/NVVM/Utils.h" @@ -156,7 +157,7 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) { return std::move(bcFiles); } -#if MLIR_CUDA_CONVERSIONS_ENABLED == 1 +#if MLIR_ENABLE_CUDA_CONVERSIONS namespace { class NVPTXSerializer : public SerializeGPUModuleBase { public: @@ -562,7 +563,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule) { return compileToBinary(*serializedISA); #endif // MLIR_NVPTXCOMPILER_ENABLED == 1 } -#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1 +#endif // MLIR_ENABLE_CUDA_CONVERSIONS std::optional> NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, @@ -574,7 +575,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, module->emitError("Module must be a GPU module."); return std::nullopt; } -#if MLIR_CUDA_CONVERSIONS_ENABLED == 1 +#if MLIR_ENABLE_CUDA_CONVERSIONS NVPTXSerializer serializer(*module, cast(attribute), options); serializer.init(); return serializer.run(); @@ -582,7 +583,7 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, module->emitError( "The `NVPTX` target was not built. Please enable it when building LLVM."); return std::nullopt; -#endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1 +#endif // MLIR_ENABLE_CUDA_CONVERSIONS } Attribute diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp index 26bfbd5c11e81e0..cea49356538f0aa 100644 --- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp +++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Config/mlir-config.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/IR/MLIRContext.h" @@ -29,10 +30,10 @@ using namespace mlir; // Skip the test if the NVPTX target was not built. 
-#if MLIR_CUDA_CONVERSIONS_ENABLED == 0 -#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x -#else +#if MLIR_ENABLE_CUDA_CONVERSIONS #define SKIP_WITHOUT_NVPTX(x) x +#else +#define SKIP_WITHOUT_NVPTX(x) DISABLED_##x #endif class MLIRTargetLLVMNVVM : public ::testing::Test { diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 59ee03d9a3213f2..f58808476cbbca4 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6,7 +6,6 @@ # The MLIR "Multi-Level Intermediate Representation" Compiler Infrastructure load("@bazel_skylib//rules:expand_template.bzl", "expand_template") -load("@bazel_skylib//rules:write_file.bzl", "write_file") load( ":build_defs.bzl", "cc_headers_only", @@ -36,7 +35,10 @@ expand_template( "#cmakedefine01 MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS": "#define MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS 0", "#cmakedefine MLIR_GREEDY_REWRITE_RANDOMIZER_SEED ${MLIR_GREEDY_REWRITE_RANDOMIZER_SEED}": "/* #undef MLIR_GREEDY_REWRITE_RANDOMIZER_SEED */", "#cmakedefine01 MLIR_ENABLE_PDL_IN_PATTERNMATCH": "#define MLIR_ENABLE_PDL_IN_PATTERNMATCH 1", - }, + } | if_cuda_available( + {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 1"}, + {"#cmakedefine01 MLIR_ENABLE_CUDA_CONVERSIONS": "#define MLIR_ENABLE_CUDA_CONVERSIONS 0"}, + ), template = "include/mlir/Config/mlir-config.h.cmake", ) @@ -5468,7 +5470,6 @@ cc_library( srcs = ["lib/Dialect/GPU/Pipelines/GPUToNVVMPipeline.cpp"], hdrs = ["include/mlir/Dialect/GPU/Pipelines/Passes.h"], includes = ["include"], - local_defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], deps = [ ":AffineToStandard", ":ArithToLLVM", @@ -5492,6 +5493,7 @@ cc_library( ":Transforms", ":VectorToLLVM", ":VectorToSCF", + ":config", ], ) @@ -5541,6 +5543,7 @@ cc_library( ":Transforms", ":VCIXToLLVMIRTranslation", ":VectorDialect", + ":config", "//llvm:Core", "//llvm:MC", "//llvm:Support", @@ -6176,6 +6179,7 @@ cc_library( ":NVVMToLLVMIRTranslation", ":TargetLLVM", ":ToLLVMIRTranslation", + ":config", "//llvm:NVPTXCodeGen", "//llvm:Support", ], @@ -9131,6 +9135,7 @@ cc_library( ":VectorTransforms", ":X86VectorDialect", ":X86VectorTransforms", + ":config", ], ) diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel index 68d9b23fd5643ea..583411aa60e5557 100644 --- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel @@ -552,7 +552,6 @@ cc_library( cc_library( name = "TestTransforms", srcs = glob(["lib/Transforms/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -579,7 +578,6 @@ cc_library( cc_library( name = "TestFuncToLLVM", srcs = glob(["lib/Conversion/FuncToLLVM/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -594,7 +592,6 @@ cc_library( cc_library( name = "TestOneToNTypeConversion", srcs = glob(["lib/Conversion/OneToNTypeConversion/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -653,7 +650,6 @@ cc_library( cc_library( name = "TestDLTI", srcs = glob(["lib/Dialect/DLTI/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -667,7 +663,7 @@ cc_library( cc_library( name = "TestGPU", srcs = 
glob(["lib/Dialect/GPU/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"] + if_cuda_available([ + defines = if_cuda_available([ "MLIR_GPU_TO_CUBIN_PASS_ENABLE", ]), includes = ["lib/Dialect/Test"], @@ -714,7 +710,6 @@ cc_library( cc_library( name = "TestLinalg", srcs = glob(["lib/Dialect/Linalg/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//llvm:Support", @@ -748,7 +743,6 @@ cc_library( cc_library( name = "TestLLVM", srcs = glob(["lib/Dialect/LLVM/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:AffineToStandard", @@ -773,7 +767,6 @@ cc_library( cc_library( name = "TestMath", srcs = glob(["lib/Dialect/Math/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:ArithDialect", @@ -790,7 +783,6 @@ cc_library( cc_library( name = "TestMathToVCIX", srcs = glob(["lib/Conversion/MathToVCIX/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:ArithDialect", @@ -807,7 +799,6 @@ cc_library( cc_library( name = "TestMemRef", srcs = glob(["lib/Dialect/MemRef/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -847,7 +838,6 @@ cc_library( cc_library( name = "TestNVGPU", srcs = glob(["lib/Dialect/NVGPU/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:AffineDialect", @@ -871,7 +861,6 @@ cc_library( cc_library( name = "TestSCF", srcs = glob(["lib/Dialect/SCF/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//llvm:Support", @@ -891,7 +880,6 @@ cc_library( cc_library( name = "TestArith", srcs = glob(["lib/Dialect/Arith/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:ArithDialect", @@ -908,7 +896,6 @@ cc_library( cc_library( name = "TestArmSME", srcs = glob(["lib/Dialect/ArmSME/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:ArithToArmSME", @@ -927,7 +914,6 @@ cc_library( cc_library( name = "TestBufferization", srcs = glob(["lib/Dialect/Bufferization/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:BufferizationDialect", @@ -989,7 +975,6 @@ cc_library( cc_library( name = "TestFunc", srcs = glob(["lib/Dialect/Func/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ ":TestDialect", @@ -1005,7 +990,6 @@ cc_library( cc_library( name = "TestTensor", srcs = glob(["lib/Dialect/Tensor/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:ArithDialect", @@ -1022,7 +1006,6 @@ cc_library( cc_library( name = "TestVector", srcs = glob(["lib/Dialect/Vector/*.cpp"]), - defines = ["MLIR_CUDA_CONVERSIONS_ENABLED"], includes = ["lib/Dialect/Test"], deps = [ "//mlir:AffineDialect", From bb0ff1541092b54f81296350ce0e8a397673a105 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 28 Feb 2024 14:48:59 +0100 Subject: [PATCH 077/114] [flang] fix one more compilation issue on aarch64. (#83258) I used `class` instead of `struct` for `SymbolMap`. 
From 8e51b22ce21b01ae0be8267c5da3703ffd3b2c5b Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Wed, 28 Feb 2024 13:55:27 +0000 Subject: [PATCH 078/114] [AArch64][GlobalISel] Legalize G_LOAD for v4s8 Vector (#82989) Lowers `v4s8 = G_LOAD %ptr ptr` into `s32 = G_LOAD %ptr ptr` `v4s8 = G_BITCAST s32` --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 8 +++++- llvm/test/CodeGen/AArch64/load.ll | 27 +++++++++++-------- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 3205484bb0631d3..91323e456a5ef84 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -388,8 +388,14 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampMaxNumElements(0, s32, 4) .clampMaxNumElements(0, s64, 2) .clampMaxNumElements(0, p0, 2) + // TODO: Use BITCAST for v2i8, v2i16 after G_TRUNC gets sorted out + .bitcastIf(typeInSet(0, {v4s8}), + [=](const LegalityQuery &Query) { + const LLT VecTy = Query.Types[0]; + return std::pair(0, LLT::scalar(VecTy.getSizeInBits())); + }) .customIf(IsPtrVecPred) - .scalarizeIf(typeIs(0, v2s16), 0); + .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0); getActionDefinitionsBuilder(G_STORE) .customIf([=](const LegalityQuery &Query) { diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index d82c1fe55a2dd2c..7f4540d915ab37b 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI - -; CHECK-GI: warning: Instruction selection used fallback path for load_v2i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for load_v4i8 +; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; ===== Legal Scalars ===== @@ -111,13 +108,21 @@ define <2 x i64> @load_v2i64(ptr %ptr){ ; ===== Smaller/Larger Width Vectors with Legal Element Sizes ===== define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b){ -; CHECK-LABEL: load_v2i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-NEXT: add x8, x0, #1 -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: load_v2i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] +; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: load_v2i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr b0, [x0] +; CHECK-GI-NEXT: ldr b1, [x0, #1] +; CHECK-GI-NEXT: mov v0.s[1], v1.s[0] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret %a = load <2 x i8>, ptr %ptr ret <2 x i8> %a } From 9e1432069555d70e1f0148742e565b31d3ba8695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ingo=20M=C3=BCller?= Date: Wed, 28 Feb 2024 13:54:19 +0000 Subject: [PATCH 079/114] [bazel] Fix breakage from 915fce040271c77df1ff9b2c8797c441cec0d18d. That commit (from #82189) introduces a new dependency but does not declare it in the BUILD files. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f58808476cbbca4..7860ccd0406a138 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4111,6 +4111,7 @@ cc_library( includes = ["include"], deps = [ ":AffineDialect", + ":AffineTransforms", ":AffineUtils", ":ArithDialect", ":ConversionPassIncGen", From a8364c9e17b46ec339e97cc00877c58a7bdf6089 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Wed, 28 Feb 2024 09:29:59 -0500 Subject: [PATCH 080/114] [mlir] Fix shared builds. NFC --- mlir/lib/Conversion/AffineToStandard/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt index 2ba0f30b1190ee7..f41e3ca27ee4dde 100644 --- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt @@ -12,12 +12,13 @@ add_mlir_conversion_library(MLIRAffineToStandard LINK_LIBS PUBLIC MLIRAffineDialect + MLIRAffineTransforms MLIRAffineUtils MLIRArithDialect MLIRIR MLIRMemRefDialect - MLIRSCFDialect MLIRPass + MLIRSCFDialect MLIRTransforms MLIRVectorDialect ) From c89d51112d329a4a37ff6dbcda7002853847c8a3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 28 Feb 2024 06:07:58 -0800 Subject: [PATCH 081/114] [SLP]Use It->second.first for BWSz, NFC. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index de4e56ff80659a2..2b7d518c1c1a78e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9364,7 +9364,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { VectorCasts .insert(std::make_pair(ScalarTE, FTy->getElementType())) .second) { - unsigned BWSz = It->second.second; + unsigned BWSz = It->second.first; unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType()); unsigned VecOpcode; if (DstBWSz < BWSz) @@ -9376,7 +9376,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { InstructionCost C = TTI->getCastInstrCost( VecOpcode, FTy, FixedVectorType::get( - IntegerType::get(FTy->getContext(), It->second.first), + IntegerType::get(FTy->getContext(), BWSz), FTy->getNumElements()), TTI::CastContextHint::None, CostKind); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C From 680c780a367bfe1c0cdf786250fd7f565ef6d23d Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 28 Feb 2024 16:44:34 +0200 Subject: [PATCH 082/114] [AMDGPU][AsmParser] Support structured HWREG operands. (#82805) Symbolic values are to be supported separately. 
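For illustration, the structured form accepted by the new parser looks like this (the operand values are illustrative; the real coverage is in the sopk.s tests below). Fields may appear in any order, and omitted fields keep their defaults (offset 0, size 32):

    s_getreg_b32 s2, {id: 6, offset: 8, size: 5} // same encoding as hwreg(6, 8, 5)
    s_getreg_b32 s2, {id: 6}                     // offset and size fall back to defaults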
--- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 215 +++++++++++------- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 10 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9 - llvm/test/MC/AMDGPU/gfx1011_err.s | 2 +- llvm/test/MC/AMDGPU/gfx1030_err.s | 2 +- llvm/test/MC/AMDGPU/gfx10_err_pos.s | 10 +- llvm/test/MC/AMDGPU/gfx940_asm_features.s | 10 +- llvm/test/MC/AMDGPU/gfx940_err.s | 12 +- llvm/test/MC/AMDGPU/sopk-err.s | 173 +++++++++++--- llvm/test/MC/AMDGPU/sopk.s | 49 +++- 10 files changed, 343 insertions(+), 149 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b7b471d8dc7b39c..18d51087ff5fba6 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1685,24 +1685,48 @@ class AMDGPUAsmParser : public MCTargetAsmParser { private: struct OperandInfoTy { SMLoc Loc; - int64_t Id; + int64_t Val; bool IsSymbolic = false; bool IsDefined = false; - OperandInfoTy(int64_t Id_) : Id(Id_) {} + OperandInfoTy(int64_t Val) : Val(Val) {} }; + struct StructuredOpField : OperandInfoTy { + StringLiteral Id; + StringLiteral Desc; + unsigned Width; + bool IsDefined = false; + + StructuredOpField(StringLiteral Id, StringLiteral Desc, unsigned Width, + int64_t Default) + : OperandInfoTy(Default), Id(Id), Desc(Desc), Width(Width) {} + virtual ~StructuredOpField() = default; + + bool Error(AMDGPUAsmParser &Parser, const Twine &Err) const { + Parser.Error(Loc, "invalid " + Desc + ": " + Err); + return false; + } + + virtual bool validate(AMDGPUAsmParser &Parser) const { + if (IsSymbolic && Val == OPR_ID_UNSUPPORTED) + return Error(Parser, "not supported on this GPU"); + if (!isUIntN(Width, Val)) + return Error(Parser, "only " + Twine(Width) + "-bit values are legal"); + return true; + } + }; + + ParseStatus parseStructuredOpFields(ArrayRef Fields); + bool validateStructuredOpFields(ArrayRef Fields); + bool parseSendMsgBody(OperandInfoTy &Msg, OperandInfoTy &Op, OperandInfoTy &Stream); bool validateSendMsg(const OperandInfoTy &Msg, const OperandInfoTy &Op, const OperandInfoTy &Stream); - bool parseHwregBody(OperandInfoTy &HwReg, - OperandInfoTy &Offset, - OperandInfoTy &Width); - bool validateHwreg(const OperandInfoTy &HwReg, - const OperandInfoTy &Offset, - const OperandInfoTy &Width); + ParseStatus parseHwregFunc(OperandInfoTy &HwReg, OperandInfoTy &Offset, + OperandInfoTy &Width); SMLoc getFlatOffsetLoc(const OperandVector &Operands) const; SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const; @@ -7197,71 +7221,44 @@ bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); } // hwreg //===----------------------------------------------------------------------===// -bool -AMDGPUAsmParser::parseHwregBody(OperandInfoTy &HwReg, - OperandInfoTy &Offset, - OperandInfoTy &Width) { +ParseStatus AMDGPUAsmParser::parseHwregFunc(OperandInfoTy &HwReg, + OperandInfoTy &Offset, + OperandInfoTy &Width) { using namespace llvm::AMDGPU::Hwreg; + if (!trySkipId("hwreg", AsmToken::LParen)) + return ParseStatus::NoMatch; + // The register may be specified by name or using a numeric code HwReg.Loc = getLoc(); if (isToken(AsmToken::Identifier) && - (HwReg.Id = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { + (HwReg.Val = getHwregId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { HwReg.IsSymbolic = true; lex(); // skip register name - } else if (!parseExpr(HwReg.Id, "a register name")) { - return false; + } else if (!parseExpr(HwReg.Val, "a register name")) { + 
return ParseStatus::Failure; } if (trySkipToken(AsmToken::RParen)) - return true; + return ParseStatus::Success; // parse optional params if (!skipToken(AsmToken::Comma, "expected a comma or a closing parenthesis")) - return false; + return ParseStatus::Failure; Offset.Loc = getLoc(); - if (!parseExpr(Offset.Id)) - return false; + if (!parseExpr(Offset.Val)) + return ParseStatus::Failure; if (!skipToken(AsmToken::Comma, "expected a comma")) - return false; + return ParseStatus::Failure; Width.Loc = getLoc(); - return parseExpr(Width.Id) && - skipToken(AsmToken::RParen, "expected a closing parenthesis"); -} - -bool -AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg, - const OperandInfoTy &Offset, - const OperandInfoTy &Width) { - - using namespace llvm::AMDGPU::Hwreg; + if (!parseExpr(Width.Val) || + !skipToken(AsmToken::RParen, "expected a closing parenthesis")) + return ParseStatus::Failure; - if (HwReg.IsSymbolic) { - if (HwReg.Id == OPR_ID_UNSUPPORTED) { - Error(HwReg.Loc, - "specified hardware register is not supported on this GPU"); - return false; - } - } else { - if (!isValidHwreg(HwReg.Id)) { - Error(HwReg.Loc, - "invalid code of hardware register: only 6-bit values are legal"); - return false; - } - } - if (!isValidHwregOffset(Offset.Id)) { - Error(Offset.Loc, "invalid bit offset: only 5-bit values are legal"); - return false; - } - if (!isValidHwregWidth(Width.Id)) { - Error(Width.Loc, - "invalid bitfield width: only values from 1 to 32 are legal"); - return false; - } - return true; + return ParseStatus::Success; } ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { @@ -7270,24 +7267,40 @@ ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) { int64_t ImmVal = 0; SMLoc Loc = getLoc(); - if (trySkipId("hwreg", AsmToken::LParen)) { - OperandInfoTy HwReg(OPR_ID_UNKNOWN); - OperandInfoTy Offset(HwregOffset::Default); - OperandInfoTy Width(HwregSize::Default); - if (parseHwregBody(HwReg, Offset, Width) && - validateHwreg(HwReg, Offset, Width)) { - ImmVal = HwregEncoding::encode(HwReg.Id, Offset.Id, Width.Id); - } else { - return ParseStatus::Failure; + StructuredOpField HwReg("id", "hardware register", HwregId::Width, + HwregId::Default); + StructuredOpField Offset("offset", "bit offset", HwregOffset::Width, + HwregOffset::Default); + struct : StructuredOpField { + using StructuredOpField::StructuredOpField; + bool validate(AMDGPUAsmParser &Parser) const override { + if (!isUIntN(Width, Val - 1)) + return Error(Parser, "only values from 1 to 32 are legal"); + return true; } - } else if (parseExpr(ImmVal, "a hwreg macro")) { - if (ImmVal < 0 || !isUInt<16>(ImmVal)) - return Error(Loc, "invalid immediate: only 16-bit values are legal"); - } else { - return ParseStatus::Failure; + } Width("size", "bitfield width", HwregSize::Width, HwregSize::Default); + ParseStatus Res = parseStructuredOpFields({&HwReg, &Offset, &Width}); + + if (Res.isNoMatch()) + Res = parseHwregFunc(HwReg, Offset, Width); + + if (Res.isSuccess()) { + if (!validateStructuredOpFields({&HwReg, &Offset, &Width})) + return ParseStatus::Failure; + ImmVal = HwregEncoding::encode(HwReg.Val, Offset.Val, Width.Val); } - Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg)); + if (Res.isNoMatch() && + parseExpr(ImmVal, "a hwreg macro, structured immediate")) + Res = ParseStatus::Success; + + if (!Res.isSuccess()) + return ParseStatus::Failure; + + if (!isUInt<16>(ImmVal)) + return Error(Loc, "invalid immediate: only 16-bit values are legal"); + 
Operands.push_back( + AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg)); return ParseStatus::Success; } @@ -7307,10 +7320,10 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, Msg.Loc = getLoc(); if (isToken(AsmToken::Identifier) && - (Msg.Id = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { + (Msg.Val = getMsgId(getTokenStr(), getSTI())) != OPR_ID_UNKNOWN) { Msg.IsSymbolic = true; lex(); // skip message name - } else if (!parseExpr(Msg.Id, "a message name")) { + } else if (!parseExpr(Msg.Val, "a message name")) { return false; } @@ -7318,16 +7331,16 @@ AMDGPUAsmParser::parseSendMsgBody(OperandInfoTy &Msg, Op.IsDefined = true; Op.Loc = getLoc(); if (isToken(AsmToken::Identifier) && - (Op.Id = getMsgOpId(Msg.Id, getTokenStr())) >= 0) { + (Op.Val = getMsgOpId(Msg.Val, getTokenStr())) >= 0) { lex(); // skip operation name - } else if (!parseExpr(Op.Id, "an operation name")) { + } else if (!parseExpr(Op.Val, "an operation name")) { return false; } if (trySkipToken(AsmToken::Comma)) { Stream.IsDefined = true; Stream.Loc = getLoc(); - if (!parseExpr(Stream.Id)) + if (!parseExpr(Stream.Val)) return false; } } @@ -7347,17 +7360,17 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, bool Strict = Msg.IsSymbolic; if (Strict) { - if (Msg.Id == OPR_ID_UNSUPPORTED) { + if (Msg.Val == OPR_ID_UNSUPPORTED) { Error(Msg.Loc, "specified message id is not supported on this GPU"); return false; } } else { - if (!isValidMsgId(Msg.Id, getSTI())) { + if (!isValidMsgId(Msg.Val, getSTI())) { Error(Msg.Loc, "invalid message id"); return false; } } - if (Strict && (msgRequiresOp(Msg.Id, getSTI()) != Op.IsDefined)) { + if (Strict && (msgRequiresOp(Msg.Val, getSTI()) != Op.IsDefined)) { if (Op.IsDefined) { Error(Op.Loc, "message does not support operations"); } else { @@ -7365,16 +7378,16 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg, } return false; } - if (!isValidMsgOp(Msg.Id, Op.Id, getSTI(), Strict)) { + if (!isValidMsgOp(Msg.Val, Op.Val, getSTI(), Strict)) { Error(Op.Loc, "invalid operation id"); return false; } - if (Strict && !msgSupportsStream(Msg.Id, Op.Id, getSTI()) && + if (Strict && !msgSupportsStream(Msg.Val, Op.Val, getSTI()) && Stream.IsDefined) { Error(Stream.Loc, "message operation does not support streams"); return false; } - if (!isValidMsgStream(Msg.Id, Op.Id, Stream.Id, getSTI(), Strict)) { + if (!isValidMsgStream(Msg.Val, Op.Val, Stream.Val, getSTI(), Strict)) { Error(Stream.Loc, "invalid message stream id"); return false; } @@ -7393,7 +7406,7 @@ ParseStatus AMDGPUAsmParser::parseSendMsg(OperandVector &Operands) { OperandInfoTy Stream(STREAM_ID_NONE_); if (parseSendMsgBody(Msg, Op, Stream) && validateSendMsg(Msg, Op, Stream)) { - ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id); + ImmVal = encodeMsg(Msg.Val, Op.Val, Stream.Val); } else { return ParseStatus::Failure; } @@ -7730,6 +7743,48 @@ AMDGPUAsmParser::getConstLoc(const OperandVector &Operands) const { return getOperandLoc(Test, Operands); } +ParseStatus +AMDGPUAsmParser::parseStructuredOpFields(ArrayRef Fields) { + if (!trySkipToken(AsmToken::LCurly)) + return ParseStatus::NoMatch; + + bool First = true; + while (!trySkipToken(AsmToken::RCurly)) { + if (!First && + !skipToken(AsmToken::Comma, "comma or closing brace expected")) + return ParseStatus::Failure; + + StringRef Id = getTokenStr(); + SMLoc IdLoc = getLoc(); + if (!skipToken(AsmToken::Identifier, "field name expected") || + !skipToken(AsmToken::Colon, "colon expected")) + return ParseStatus::Failure; + + auto I = + 
find_if(Fields, [Id](StructuredOpField *F) { return F->Id == Id; }); + if (I == Fields.end()) + return Error(IdLoc, "unknown field"); + if ((*I)->IsDefined) + return Error(IdLoc, "duplicate field"); + + // TODO: Support symbolic values. + (*I)->Loc = getLoc(); + if (!parseExpr((*I)->Val)) + return ParseStatus::Failure; + (*I)->IsDefined = true; + + First = false; + } + return ParseStatus::Success; +} + +bool AMDGPUAsmParser::validateStructuredOpFields( + ArrayRef Fields) { + return all_of(Fields, [this](const StructuredOpField *F) { + return F->validate(*this); + }); +} + //===----------------------------------------------------------------------===// // swizzle //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 177d99a0ac0abe6..963dc2882fcc0d7 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1700,16 +1700,6 @@ int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI) { return (Idx < 0) ? Idx : Opr[Idx].Encoding; } -bool isValidHwreg(int64_t Id) { return 0 <= Id && isUInt(Id); } - -bool isValidHwregOffset(int64_t Offset) { - return 0 <= Offset && isUInt(Offset); -} - -bool isValidHwregWidth(int64_t Width) { - return 0 <= (Width - 1) && isUInt(Width - 1); -} - StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI) { int Idx = getOprIdx(Id, Opr, OPR_SIZE, STI); return (Idx < 0) ? "" : Opr[Idx].Name; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 9a6d0834679eae2..b2fc7d874fe5881 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -1068,15 +1068,6 @@ using HwregEncoding = EncodingFields; LLVM_READONLY int64_t getHwregId(const StringRef Name, const MCSubtargetInfo &STI); -LLVM_READNONE -bool isValidHwreg(int64_t Id); - -LLVM_READNONE -bool isValidHwregOffset(int64_t Offset); - -LLVM_READNONE -bool isValidHwregWidth(int64_t Width); - LLVM_READNONE StringRef getHwreg(unsigned Id, const MCSubtargetInfo &STI); diff --git a/llvm/test/MC/AMDGPU/gfx1011_err.s b/llvm/test/MC/AMDGPU/gfx1011_err.s index 4b37aaf221e395f..a86e48a29c78f73 100644 --- a/llvm/test/MC/AMDGPU/gfx1011_err.s +++ b/llvm/test/MC/AMDGPU/gfx1011_err.s @@ -17,7 +17,7 @@ v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES) -// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU v_fma_legacy_f32 v0, v1, v2, v3 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s index ba8784a39c3698f..f4ab5fe5b14a925 100644 --- a/llvm/test/MC/AMDGPU/gfx1030_err.s +++ b/llvm/test/MC/AMDGPU/gfx1030_err.s @@ -25,7 +25,7 @@ s_get_waveid_in_workgroup s0 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) -// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU v_mac_f32 v0, v1, v2 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: 
instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx10_err_pos.s b/llvm/test/MC/AMDGPU/gfx10_err_pos.s index 1d34f00ee0f9213..c2679db3b2acfa9 100644 --- a/llvm/test/MC/AMDGPU/gfx10_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx10_err_pos.s @@ -448,7 +448,7 @@ ds_swizzle_b32 v8, v2 offset:SWZ(QUAD_PERM, 0, 1, 2, 3) // expected a hwreg macro or an absolute expression s_setreg_b32 undef, s2 -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression // CHECK-NEXT:{{^}}s_setreg_b32 undef, s2 // CHECK-NEXT:{{^}} ^ @@ -621,10 +621,10 @@ s_setreg_b32 hwreg(3,0,33), s2 // CHECK-NEXT:{{^}} ^ //============================================================================== -// invalid code of hardware register: only 6-bit values are legal +// invalid hardware register: only 6-bit values are legal s_setreg_b32 hwreg(0x40), s2 -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal // CHECK-NEXT:{{^}}s_setreg_b32 hwreg(0x40), s2 // CHECK-NEXT:{{^}} ^ @@ -1158,10 +1158,10 @@ v_movrels_b32_sdwa v0, shared_base // CHECK-NEXT:{{^}} ^ //============================================================================== -// specified hardware register is not supported on this GPU +// invalid hardware register: not supported on this GPU s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES) -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // CHECK-NEXT:{{^}}s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES) // CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx940_asm_features.s b/llvm/test/MC/AMDGPU/gfx940_asm_features.s index 5ee9480677be92f..e208b6cf903d387 100644 --- a/llvm/test/MC/AMDGPU/gfx940_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx940_asm_features.s @@ -197,23 +197,23 @@ scratch_load_lds_ushort v2, off // GFX940: scratch_load_lds_sshort v2, off ; encoding: [0x00,0x60,0xa4,0xdc,0x02,0x00,0x7f,0x00] scratch_load_lds_sshort v2, off -// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX940: s_getreg_b32 s1, hwreg(HW_REG_XCC_ID) ; encoding: [0x14,0xf8,0x81,0xb8] s_getreg_b32 s1, hwreg(HW_REG_XCC_ID) -// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) ; encoding: [0x15,0xf8,0x81,0xb8] s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA) -// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) ; encoding: [0x16,0xf8,0x81,0xb8] s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_DATA1) -// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX940: 
s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) ; encoding: [0x17,0xf8,0x81,0xb8] s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_LO) -// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOT-GFX940: :[[@LINE+2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX940: s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) ; encoding: [0x18,0xf8,0x81,0xb8] s_getreg_b32 s1, hwreg(HW_REG_SQ_PERF_SNAPSHOT_PC_HI) diff --git a/llvm/test/MC/AMDGPU/gfx940_err.s b/llvm/test/MC/AMDGPU/gfx940_err.s index 515b89513a8048f..000f3decf96071d 100644 --- a/llvm/test/MC/AMDGPU/gfx940_err.s +++ b/llvm/test/MC/AMDGPU/gfx940_err.s @@ -97,22 +97,22 @@ v_cvt_pk_fp8_f32 v1, v2, v3 mul:2 // GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_LO) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s1, hwreg(HW_REG_FLAT_SCR_HI) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s1, hwreg(HW_REG_XNACK_MASK) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s1, hwreg(HW_REG_HW_ID1) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s1, hwreg(HW_REG_HW_ID2) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s1, hwreg(HW_REG_POPS_PACKER) -// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU ds_ordered_count v5, v1 offset:65535 gds // GFX940: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/sopk-err.s b/llvm/test/MC/AMDGPU/sopk-err.s index 504ee1d11cbc97d..cd92343b0e7fe70 100644 --- a/llvm/test/MC/AMDGPU/sopk-err.s +++ b/llvm/test/MC/AMDGPU/sopk-err.s @@ -5,48 +5,127 @@ // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 -show-encoding %s | FileCheck -check-prefix=GFX10 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefix=GFX11 %s -// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: %s -// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR 
--implicit-check-not=error: %s +// RUN: not llvm-mc -triple=amdgcn %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=tahiti %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=tonga %s 2>&1 | FileCheck -check-prefixes=GCN,SICIVI-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx900 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX9-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX10-ERR --implicit-check-not=error: --strict-whitespace %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1100 %s 2>&1 | FileCheck -check-prefixes=GCN,GFX11-ERR --implicit-check-not=error: --strict-whitespace %s s_setreg_b32 0x1f803, s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal +// GCN-NEXT: {{^}}s_setreg_b32 0x1f803, s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 typo(0x40), s2 -// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro or an absolute expression +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a hwreg macro, structured immediate or an absolute expression +// GCN-NEXT: {{^}}s_setreg_b32 typo(0x40), s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(0x40), s2 -// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid code of hardware register: only 6-bit values are legal +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(0x40), s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 0x40}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: only 6-bit values are legal +// GCN-NEXT: {{^}}s_setreg_b32 {id: 0x40}, s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(HW_REG_WRONG), s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a register name or an absolute expression +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(HW_REG_WRONG), s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(1 2,3), s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma or a closing parenthesis +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1 2,3), s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(1,2 3), s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a comma +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1,2 3), s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(1,2,3, s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: expected a closing parenthesis +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(1,2,3, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 1 offset: 2, size: 3}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected +// GCN-NEXT: {{^}}s_setreg_b32 {id: 1 offset: 2, size: 3}, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 1 offset: 2, size: 3}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: comma or closing brace expected +// GCN-NEXT: {{^}}s_setreg_b32 {id: 1 offset: 2, size: 3}, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id 1, offset: 2, size: 3}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected +// GCN-NEXT: {{^}}s_setreg_b32 {id 1, offset: 2, size: 3}, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 1, offset: 2, size: 3, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: colon expected +// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, offset: 2, size: 3, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 1, offset: 2, blah: 3}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: unknown field +// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, offset: 2, 
blah: 3}, s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 1, id: 2}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: duplicate field +// GCN-NEXT: {{^}}s_setreg_b32 {id: 1, id: 2}, s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(3,32,32), s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(3,32,32), s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 3, offset: 32, size: 32}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal +// GCN-NEXT: {{^}}s_setreg_b32 {id: 3, offset: 32, size: 32}, s2 +// GCN-NEXT: {{^}} ^ s_setreg_b32 hwreg(3,0,33), s2 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal +// GCN-NEXT: {{^}}s_setreg_b32 hwreg(3,0,33), s2 +// GCN-NEXT: {{^}} ^ + +s_setreg_b32 {id: 3, offset: 0, size: 33}, s2 +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal +// GCN-NEXT: {{^}}s_setreg_b32 {id: 3, offset: 0, size: 33}, s2 +// GCN-NEXT: {{^}} ^ s_setreg_imm32_b32 0x1f803, 0xff // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid immediate: only 16-bit values are legal +// GCN-NEXT: {{^}}s_setreg_imm32_b32 0x1f803, 0xff +// GCN-NEXT: {{^}} ^ s_setreg_imm32_b32 hwreg(3,0,33), 0xff // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal +// GCN-NEXT: {{^}}s_setreg_imm32_b32 hwreg(3,0,33), 0xff +// GCN-NEXT: {{^}} ^ + +s_setreg_imm32_b32 {id: 3, offset: 0, size: 33}, 0xff +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bitfield width: only values from 1 to 32 are legal +// GCN-NEXT: {{^}}s_setreg_imm32_b32 {id: 3, offset: 0, size: 33}, 0xff +// GCN-NEXT: {{^}} ^ s_getreg_b32 s2, hwreg(3,32,32) // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal +// GCN-NEXT: {{^}}s_getreg_b32 s2, hwreg(3,32,32) +// GCN-NEXT: {{^}} ^ + +s_getreg_b32 s2, {id: 3, offset: 32, size: 32} +// GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid bit offset: only 5-bit values are legal +// GCN-NEXT: {{^}}s_getreg_b32 s2, {id: 3, offset: 32, size: 32} +// GCN-NEXT: {{^}} ^ s_cbranch_i_fork s[2:3], 0x6 // SICI: s_cbranch_i_fork s[2:3], 6 ; encoding: [0x06,0x00,0x82,0xb8] @@ -57,69 +136,109 @@ s_cbranch_i_fork s[2:3], 0x6 s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) +// SICIVI-ERR-NEXT: {{^}} ^ // GFX9: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8] // GFX11: s_getreg_b32 s2, hwreg(HW_REG_SH_MEM_BASES) ; encoding: [0x0f,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) ; encoding: [0x10,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR-NEXT: 
{{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) +// GFX11-ERR-NEXT: {{^}} ^ // GFX9: s_getreg_b32 s2, hwreg(HW_REG_TBA_LO) ; encoding: [0x10,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) ; encoding: [0x11,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) +// GFX11-ERR-NEXT: {{^}} ^ // GFX9: s_getreg_b32 s2, hwreg(HW_REG_TBA_HI) ; encoding: [0x11,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) ; encoding: [0x12,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) +// GFX11-ERR-NEXT: {{^}} ^ // GFX9: s_getreg_b32 s2, hwreg(HW_REG_TMA_LO) ; encoding: [0x12,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) ; encoding: [0x13,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX11-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) +// GFX11-ERR-NEXT: {{^}} ^ // GFX9: s_getreg_b32 s2, hwreg(HW_REG_TMA_HI) ; encoding: [0x13,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) +// GFX9-ERR-NEXT: {{^}} ^ // GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_LO) ; encoding: [0x14,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware 
register is not supported on this GPU -// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX9-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) +// GFX9-ERR-NEXT: {{^}} ^ // GFX11: s_getreg_b32 s2, hwreg(HW_REG_FLAT_SCR_HI) ; encoding: [0x15,0xf8,0x82,0xb8] s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) ; encoding: [0x16,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER) // GFX10: s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER) ; encoding: [0x19,0xf8,0x02,0xb9] -// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX9-ERR: :[[@LINE-3]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// GFX11-ERR: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// SICIVI-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// SICIVI-ERR-NEXT: {{^}}s_getreg_b32 s2, hwreg(HW_REG_POPS_PACKER) +// SICIVI-ERR-NEXT: {{^}} ^ +// GFX9-ERR: :[[@LINE-5]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// GFX11-ERR: :[[@LINE-6]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_cmpk_le_u32 s2, -1 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, -1 +// GCN-NEXT: {{^}} ^ s_cmpk_le_u32 s2, 0x1ffff // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x1ffff +// GCN-NEXT: {{^}} ^ s_cmpk_le_u32 s2, 0x10000 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GCN-NEXT: {{^}}s_cmpk_le_u32 s2, 0x10000 +// GCN-NEXT: {{^}} ^ s_mulk_i32 s2, 0xFFFFFFFFFFFF0000 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GCN-NEXT: {{^}}s_mulk_i32 s2, 0xFFFFFFFFFFFF0000 +// GCN-NEXT: {{^}} ^ s_mulk_i32 s2, 0x10000 // GCN: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// GCN-NEXT: {{^}}s_mulk_i32 s2, 0x10000 +// GCN-NEXT: {{^}} ^ diff --git a/llvm/test/MC/AMDGPU/sopk.s b/llvm/test/MC/AMDGPU/sopk.s index 2b20c35aa7719ae..c912b83ca61c27c 100644 --- a/llvm/test/MC/AMDGPU/sopk.s +++ b/llvm/test/MC/AMDGPU/sopk.s @@ -158,6 +158,12 @@ s_getreg_b32 s2, hwreg(51, 1, 31) // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +s_getreg_b32 s2, {id: 51, offset: 1, 
size: 31} +// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] +// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] +// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] + // HW register code of unknown HW register, default offset/width s_getreg_b32 s2, hwreg(51) // SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] @@ -165,6 +171,27 @@ s_getreg_b32 s2, hwreg(51) // GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] // GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] +// Structured form using default values. +s_getreg_b32 s2, {id: 51} +// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] +// VI9: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] +// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] + +// Fields may come in any order. +s_getreg_b32 s2, {size: 32, id: 51} +// SICI: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] +// VI9: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x02,0xb9] +// GFX11: s_getreg_b32 s2, hwreg(51) ; encoding: [0x33,0xf8,0x82,0xb8] + +// Empty field lists are allowed. +s_getreg_b32 s2, {} +// SICI: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9] +// VI9: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x02,0xb9] +// GFX11: s_getreg_b32 s2, hwreg(0) ; encoding: [0x00,0xf8,0x82,0xb8] + // HW register code of unknown HW register, valid symbolic name range but no name available s_getreg_b32 s2, hwreg(10) // SICI: s_getreg_b32 s2, hwreg(10) ; encoding: [0x0a,0xf8,0x02,0xb9] @@ -271,17 +298,17 @@ s_setreg_b32 hwreg(HW_REG_HW_ID), s2 // SICI: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x82,0xb9] // VI9: s_setreg_b32 hwreg(HW_REG_HW_ID), s2 ; encoding: [0x04,0xf8,0x02,0xb9] // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x82,0xb9] -// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOGFX11: :[[@LINE-4]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 -// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x82,0xb9] // GFX11: s_setreg_b32 hwreg(HW_REG_HW_ID1), s2 ; encoding: [0x17,0xf8,0x02,0xb9] s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 -// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU -// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU +// NOSICIVI: :[[@LINE-1]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU +// NOGFX9: :[[@LINE-2]]:{{[0-9]+}}: error: invalid hardware register: not supported on this GPU // GFX10: s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 ; encoding: [0x18,0xf8,0x82,0xb9] // GFX11: 
s_setreg_b32 hwreg(HW_REG_HW_ID2), s2 ; encoding: [0x18,0xf8,0x02,0xb9] @@ -427,12 +454,24 @@ s_getreg_b32 s2, hwreg(reg + 1, offset - 1, width + 1) // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +s_getreg_b32 s2, {id: reg + 1, offset: offset - 1, size: width + 1} +// SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] +// VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] +// GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] +// GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] + s_getreg_b32 s2, hwreg(1 + reg, -1 + offset, 1 + width) // SICI: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // VI9: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] // GFX10: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x02,0xb9] // GFX11: s_getreg_b32 s2, hwreg(51, 1, 31) ; encoding: [0x73,0xf0,0x82,0xb8] + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// From fdd60c7b961cd7beb2aaa4c5d0c559c34bbfafe8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com> Date: Wed, 28 Feb 2024 10:15:52 -0500 Subject: [PATCH 083/114] [LV][NFC] Preselect folding style while choosing the max VF, NFC. Select the tail-folding style while choosing the max vector factor and store it in a data member, rather than recalculating it on every getTailFoldingStyle call. Part of https://github.com/llvm/llvm-project/pull/76172 Reviewers: ayalz, fhahn Reviewed By: fhahn Pull Request: https://github.com/llvm/llvm-project/pull/81885 --- .../Transforms/Vectorize/LoopVectorize.cpp | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2e76adb83bd5039..e5bce7299a970fe 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1510,19 +1510,36 @@ class LoopVectorizationCostModel { } /// Returns the TailFoldingStyle that is best for the current loop. - TailFoldingStyle - getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { - if (!CanFoldTailByMasking) - return TailFoldingStyle::None; + TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { + return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first + : ChosenTailFoldingStyle.second; + } + + /// Selects and saves TailFoldingStyle for 2 options - if IV update may + /// overflow or not. 
+ void setTailFoldingStyles() { + assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None && + ChosenTailFoldingStyle.second == TailFoldingStyle::None && + "Tail folding must not be selected yet."); + if (!Legal->prepareToFoldTailByMasking()) + return; - if (ForceTailFoldingStyle.getNumOccurrences()) - return ForceTailFoldingStyle; + if (ForceTailFoldingStyle.getNumOccurrences()) { + ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second = + ForceTailFoldingStyle; + return; + } - return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); + ChosenTailFoldingStyle.first = + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true); + ChosenTailFoldingStyle.second = + TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false); } /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { + // TODO: check if it is possible to check for None style independent of + // IVUpdateMayOverflow flag in getTailFoldingStyle. return getTailFoldingStyle() != TailFoldingStyle::None; } @@ -1675,8 +1692,10 @@ class LoopVectorizationCostModel { /// iterations to execute in the scalar loop. ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + /// Control finally chosen tail folding style. The first element is used if + /// the IV update may overflow, the second element - if it does not. + std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle = + std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -4633,10 +4652,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. - if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; + setTailFoldingStyles(); + if (foldTailByMasking()) return MaxFactors; - } // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. From cb6c0f1d28c0d1915d1ca9a198254e3828af2384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 28 Feb 2024 13:06:41 +0100 Subject: [PATCH 084/114] [clang][Interp] Ignore ArrayDecay ops for null pointers Just don't do anything and let later operations handle the diagnostics. 
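To see why the early-out is safe, consider the constant expression the new test below exercises: the array-to-pointer decay operates on a pointer derived from nullptr, but nothing is ever read through it, so the decay itself must stay silent and any diagnostic belongs to an operation that actually dereferences. A minimal standalone C++ illustration (the variable name and the main-based check are illustrative additions, not part of the patch; clang exercises this path under -fexperimental-new-constant-interpreter):

// Decay of a null-based array lvalue: &(*p)[0] computes an address without
// reading memory, so constant evaluation accepts it, and the result is
// still a null pointer.
constexpr const int *np = &(*(int(*)[4])nullptr)[0];

int main() { return np == nullptr ? 0 : 1; } // exits 0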
--- clang/lib/AST/Interp/Interp.h | 2 +- clang/test/AST/Interp/cxx11.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index db52f6649c18ba2..241d5941e143ee1 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -1933,7 +1933,7 @@ inline bool ArrayElemPop(InterpState &S, CodePtr OpPC, uint32_t Index) { inline bool ArrayDecay(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop<Pointer>(); - if (Ptr.isDummy()) { + if (Ptr.isZero() || Ptr.isDummy()) { S.Stk.push<Pointer>(Ptr); return true; } diff --git a/clang/test/AST/Interp/cxx11.cpp b/clang/test/AST/Interp/cxx11.cpp index 29098ea694c6881..993e3618a37848f 100644 --- a/clang/test/AST/Interp/cxx11.cpp +++ b/clang/test/AST/Interp/cxx11.cpp @@ -28,3 +28,5 @@ struct S { }; constexpr S s = { 5 }; constexpr const int *p = &s.m + 1; + +constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok From 3e35ba53e20dbbd3ccc191d71ed75d52dc36ec59 Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Wed, 28 Feb 2024 16:18:04 +0100 Subject: [PATCH 085/114] AMDGPU/GFX12: Insert waitcnts before stores with scope_sys (#82996) Insert waitcnts for loads and atomics before stores with system scope. Scope is a field in the instruction encoding and corresponds to the desired coherence level in the cache hierarchy. Intrinsic stores can set the scope in the cache policy operand. If the volatile keyword is used on a generic store, the memory legalizer sets the scope to system; by default, generic stores get the lowest scope level. Waitcnts are not required if it is guaranteed that the memory is cached; Vulkan shaders, for example, can guarantee this. TODO: implement a flag for frontends to give us a hint not to insert waits. The Vulkan flag is expected to be implemented as a vulkan:private MMRA. 
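As a rough model of the rule the legalizer applies here (an illustrative sketch only: CacheScope and waitsBeforeStore are made-up names, and the real pass below emits S_WAIT_*_soft MachineInstrs via BuildMI rather than strings):

#include <cstdio>
#include <string>
#include <vector>

// Illustrative GFX12 cache-scope levels, lowest to highest coherence.
enum class CacheScope { CU, SE, Dev, Sys };

// A store with system scope must first wait for {LOAD|SAMPLE|BVH|KM}CNT==0
// (covering loads and returning atomics) and STORECNT==0 (covering
// non-returning atomics); lower scopes are cached and need no pre-store waits.
std::vector<std::string> waitsBeforeStore(CacheScope Scope) {
  if (Scope != CacheScope::Sys)
    return {};
  return {"s_wait_loadcnt 0x0", "s_wait_samplecnt 0x0", "s_wait_bvhcnt 0x0",
          "s_wait_kmcnt 0x0", "s_wait_storecnt 0x0"};
}

int main() {
  for (const std::string &W : waitsBeforeStore(CacheScope::Sys))
    std::puts(W.c_str());
}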
--- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 47 +++++++++++++++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 1 + .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 8 ++++ llvm/test/CodeGen/AMDGPU/clamp.ll | 1 + llvm/test/CodeGen/AMDGPU/flat-scratch.ll | 20 ++++++++ .../lower-work-group-id-intrinsics-hsa.ll | 1 + llvm/test/CodeGen/AMDGPU/omod.ll | 2 +- .../AMDGPU/vgpr-mark-last-scratch-load.ll | 4 ++ .../wait-before-stores-with-scope_sys.ll | 26 ++++++++++ .../wait-before-stores-with-scope_sys.mir | 43 +++++++++++++++++ 11 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index d774826c1d08c03..a8a33a5fecb4134 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return AMDGPU::S_WAIT_BVHCNT; case AMDGPU::S_WAIT_DSCNT_soft: return AMDGPU::S_WAIT_DSCNT; + case AMDGPU::S_WAIT_KMCNT_soft: + return AMDGPU::S_WAIT_KMCNT; default: return Opcode; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index f62e808b33e42b2..4069a368f68719b 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -312,6 +312,10 @@ class SICacheControl { SIMemOp Op, bool IsVolatile, bool IsNonTemporal) const = 0; + virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { + return false; + }; + /// Inserts any necessary instructions at position \p Pos relative /// to instruction \p MI to ensure memory instructions before \p Pos of kind /// \p Op associated with address spaces \p AddrSpace have completed. Used @@ -589,6 +593,15 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool setScope(const MachineBasicBlock::iterator MI, AMDGPU::CPol::CPol Value) const; + // Stores with system scope (SCOPE_SYS) need to wait for: + // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 + // - non-returning-atomics - wait for STORECNT==0 + // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits + // since it does not distinguish atomics-with-return from regular stores. + // There is no need to wait if memory is cached (mtype != UC). + bool + insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; + public: SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} @@ -603,6 +616,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { SIAtomicAddrSpace AddrSpace, SIMemOp Op, bool IsVolatile, bool IsNonTemporal) const override; + + bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; }; class SIMemoryLegalizer final : public MachineFunctionPass { @@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, return false; } +bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( + const MachineBasicBlock::iterator MI) const { + // TODO: implement flag for frontend to give us a hint not to insert waits. 
+ + MachineBasicBlock &MBB = *MI->getParent(); + const DebugLoc &DL = MI->getDebugLoc(); + + BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); + BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); + + return true; +} + bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, SIMemOp Op, @@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( if (IsVolatile) { Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); + if (Op == SIMemOp::STORE) + Changed |= insertWaitsBeforeSystemScopeStore(MI); + // Ensure operation has completed at system scope to cause all volatile // operations to be visible outside the program in a global order. Do not // request cross address space as only the global address space can be @@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( return Changed; } +bool SIGfx12CacheControl::expandSystemScopeStore( + MachineBasicBlock::iterator &MI) const { + MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); + if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) + return insertWaitsBeforeSystemScopeStore(MI); + + return false; +} + bool SIMemoryLegalizer::removeAtomicPseudoMIs() { if (AtomicPseudoMIs.empty()) return false; @@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, Changed |= CC->enableVolatileAndOrNonTemporal( MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), MOI.isNonTemporal()); + + // GFX12 specific, scope(desired coherence domain in cache hierarchy) is + // instruction field, do not confuse it with atomic scope. 
+ Changed |= CC->expandSystemScopeStore(MI); return Changed; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 0fe2845f8edc310..b5de311f8c58cef 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in { def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">; + def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; } def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16", diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index 921bdb5015c79ac..63e7339d829e1d5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -256,6 +256,7 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS @@ -607,6 +608,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS @@ -921,6 +923,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS @@ -1089,6 +1092,7 @@ define void @store_load_large_imm_offset_foo() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS @@ -1242,6 +1246,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -1306,6 +1311,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 15 ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -1389,6 +1395,7 @@ define 
void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s0, 1 ; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 ; GFX12-NEXT: v_mov_b32_e32 v1, s0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS @@ -1478,6 +1485,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_mov_b32 s0, 1 ; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll index 7c1c24f4a67dfa7..dfadd8d205b04e1 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -525,6 +525,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr ; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1 ; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1 ; GFX12-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 687d8456569229f..850be72f06c7d06 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -893,6 +893,7 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS @@ -964,6 +965,7 @@ define void @store_load_vindex_foo(i32 %idx) { ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS @@ -2137,6 +2139,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS @@ -2221,6 +2224,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) { ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS @@ -3382,6 +3386,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: 
v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS @@ -3468,6 +3473,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) { ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS @@ -3714,6 +3720,7 @@ define void @store_load_large_imm_offset_foo() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS @@ -3789,6 +3796,7 @@ define void @store_load_large_imm_offset_foo() { ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS @@ -3998,6 +4006,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -4055,6 +4064,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) { ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -4107,6 +4117,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -4164,6 +4175,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS @@ -4220,6 +4232,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-NEXT: v_mov_b32_e32 v3, 3 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS 
; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS @@ -4282,6 +4295,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS @@ -4340,6 +4354,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS @@ -4405,6 +4420,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS @@ -4456,6 +4472,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS @@ -4523,6 +4540,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS @@ -4576,6 +4594,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS @@ -4644,6 +4663,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture ; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0 ; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll index afa914c8375f64a..9547f08d3eba6b4 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -269,6 +269,7 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, 
ptr addrspace(1) ; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 ; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 ; GFX12-NEXT: v_mov_b32_e32 v8, s1 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll index fa1ca66ef4156c5..769d035858ca83a 100644 --- a/llvm/test/CodeGen/AMDGPU/omod.ll +++ b/llvm/test/CodeGen/AMDGPU/omod.ll @@ -651,8 +651,8 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 { ; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mul_f32_e32 v1, 4.0, v0 -; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b32 v[0:1], v1, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll index 137bd0f5d9f14e2..4efa1e9353ab3a5 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -28,6 +28,7 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v5, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload @@ -116,6 +117,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload @@ -174,6 +176,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: scratch_store_b32 off, v0, off offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v10, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload @@ -208,6 +211,7 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; CHECK-NEXT: scratch_load_b32 v0, off, off th:TH_LOAD_LU ; 4-byte Folded Reload ; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS ; CHECK-NEXT: s_wait_storecnt 0x0 ; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll new file mode 100644 index 000000000000000..e6fbe97f8dc0a5c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -global-isel=0 -march=amdgcn 
-mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { +; GFX12-LABEL: intrinsic_store_system_scope: +; GFX12: ; %bb.0: +; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24) + ret void +} + +define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) { +; GFX12-LABEL: generic_store_volatile: +; GFX12: ; %bb.0: +; GFX12-NEXT: global_store_b32 v[1:2], v0, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + store volatile i32 %val, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir new file mode 100644 index 000000000000000..acf8bd3a6ab563e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.mir @@ -0,0 +1,43 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: intrinsic_store_system_scope +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + + ; GFX12-LABEL: name: intrinsic_store_system_scope + ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-NEXT: S_WAIT_KMCNT_soft 0 + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + ; GFX12-NEXT: S_ENDPGM 0 + BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8) + S_ENDPGM 0 +... + +--- +name: generic_store_volatile +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX12-LABEL: name: generic_store_volatile + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: S_WAIT_LOADCNT_soft 0 + ; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0 + ; GFX12-NEXT: S_WAIT_BVHCNT_soft 0 + ; GFX12-NEXT: S_WAIT_KMCNT_soft 0 + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 24, implicit $exec :: (volatile store (s32), addrspace 1) + ; GFX12-NEXT: S_WAIT_STORECNT_soft 0 + ; GFX12-NEXT: S_ENDPGM 0 + GLOBAL_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1) + S_ENDPGM 0 +... 
From 6067129fbe8944d5bd39dc1bc2331ce5ed6ecee8 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Wed, 28 Feb 2024 10:52:57 -0500 Subject: [PATCH 086/114] Revert "[mlir][vector] Add a pattern to fuse extract(constant_mask) (#81057)" (#83275) This reverts commit 5cdb8c0c8854d08ac7ca131ce3e8d78a32eb6716. This pattern is producing incorrect IR. For example, ```mlir func.func @extract_subvector_from_constant_mask() -> vector<16xi1> { %mask = vector.constant_mask [2, 3] : vector<16x16xi1> %extract = vector.extract %mask[8] : vector<16xi1> from vector<16x16xi1> return %extract : vector<16xi1> } ``` Canonicalizes to ```mlir func.func @extract_subvector_from_constant_mask() -> vector<16xi1> { %0 = vector.constant_mask [3] : vector<16xi1> return %0 : vector<16xi1> } ``` Where it should be a zero mask because the extraction index (8) is greater than the constant mask size along that dim (2). --- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 74 +--------------------- mlir/test/Dialect/Vector/canonicalize.mlir | 73 --------------------- 2 files changed, 1 insertion(+), 146 deletions(-) diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 8c341a347ff6628..5be6a628904cdf7 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -2040,77 +2040,6 @@ class ExtractOpFromCreateMask final : public OpRewritePattern { } }; -// Patterns to rewrite ExtractOp(ConstantMaskOp) -// -// When the result of ExtractOp is a subvector of input, we can rewrite it as -// a ConstantMaskOp with subvector ranks. -// -// ExtractOp(ConstantMaskOp) -> ConstantMaskOp -// -// When the result of ExtractOp is a scalar, we can get the scalar value -// directly. -// -// ExtractOp(ConstantMaskOp) -> ConstantOp -class ExtractOpFromConstantMask final : public OpRewritePattern { -public: - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractOp extractOp, - PatternRewriter &rewriter) const override { - auto constantMaskOp = - extractOp.getVector().getDefiningOp(); - if (!constantMaskOp) - return failure(); - - // All indices must be static. - ArrayRef extractOpPos = extractOp.getStaticPosition(); - unsigned dynamicPosCount = - llvm::count_if(extractOpPos, ShapedType::isDynamic); - // If there is any dynamic position in ExtractOp, we cannot determine the - // scalar value. - if (dynamicPosCount) - return failure(); - - ArrayRef maskDimSizes = - constantMaskOp.getMaskDimSizes().getValue(); - Type resultTy = extractOp.getResult().getType(); - if (resultTy.isa()) { - auto resultVectorTy = resultTy.cast(); - int64_t resultRank = resultVectorTy.getRank(); - int64_t n = maskDimSizes.size(); - std::vector indices; - for (auto i = n - resultRank; i < n; ++i) - indices.push_back(cast(maskDimSizes[i]).getInt()); - - rewriter.replaceOpWithNewOp( - extractOp, resultVectorTy, - vector::getVectorSubscriptAttr(rewriter, indices)); - - return success(); - } else if (resultTy.isa()) { - // ConstantMaskOp creates and returns a vector mask where elements of the - // result vector are set to ‘0’ or ‘1’, based on whether the element - // indices are contained within a hyper-rectangular region. - // We go through ExtractOp static positions to determine the position is - // within the hyper-rectangular region or not. 
- Type boolType = rewriter.getI1Type(); - IntegerAttr setAttr = IntegerAttr::get(boolType, 1); - for (size_t i = 0, end = extractOpPos.size(); i < end; ++i) { - if (cast(maskDimSizes[i]).getInt() <= extractOpPos[i]) { - setAttr = IntegerAttr::get(boolType, 0); - break; - } - } - - rewriter.replaceOpWithNewOp(extractOp, boolType, - setAttr); - return success(); - } - - return failure(); - } -}; - // Folds extract(shape_cast(..)) into shape_cast when the total element count // does not change. LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp, @@ -2137,8 +2066,7 @@ LogicalResult foldExtractFromShapeCastToShapeCast(ExtractOp extractOp, void ExtractOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { results.add(context); + ExtractOpFromBroadcast, ExtractOpFromCreateMask>(context); results.add(foldExtractFromShapeCastToShapeCast); } diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index dc486181ebe9605..e6f045e12e51973 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -2567,76 +2567,3 @@ func.func @load_store_forwarding_rank_mismatch(%v0: vector<4x1x1xf32>, %arg0: te tensor<4x4x4xf32>, vector<1x100x4x5xf32> return %r : vector<1x100x4x5xf32> } - -// ----- - -// CHECK-LABEL: func.func @extract_true_from_constant_mask() -> i1 { -func.func @extract_true_from_constant_mask() -> i1 { -// CHECK: %[[TRUE:.*]] = arith.constant true -// CHECK-NEXT: return %[[TRUE]] : i1 - %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> - %extract = vector.extract %mask[1, 1, 2] : i1 from vector<4x4x4xi1> - return %extract : i1 -} - -// ----- - -// CHECK-LABEL: func.func @extract_false_from_constant_mask() -> i1 { -func.func @extract_false_from_constant_mask() -> i1 { -// CHECK: %[[FALSE:.*]] = arith.constant false -// CHECK-NEXT: return %[[FALSE]] : i1 - %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> - %extract = vector.extract %mask[1, 2, 2] : i1 from vector<4x4x4xi1> - return %extract : i1 -} - -// ----- - -// CHECK-LABEL: func.func @extract_from_create_mask() -> i1 { -func.func @extract_from_create_mask() -> i1 { -// CHECK: %[[TRUE:.*]] = arith.constant true -// CHECK-NEXT: return %[[TRUE]] : i1 - %c2 = arith.constant 2 : index - %c3 = arith.constant 3 : index - %mask = vector.create_mask %c2, %c2, %c3 : vector<4x4x4xi1> - %extract = vector.extract %mask[1, 1, 2] : i1 from vector<4x4x4xi1> - return %extract : i1 -} - -// ----- - -// CHECK-LABEL: func.func @extract_subvector_from_constant_mask() -> -// CHECK-SAME: vector<6xi1> { -func.func @extract_subvector_from_constant_mask() -> vector<6xi1> { -// CHECK: %[[S0:.*]] = vector.constant_mask [4] : vector<6xi1> -// CHECK-NEXT: return %[[S0]] : vector<6xi1> - %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1> - %extract = vector.extract %mask[1, 2] : vector<6xi1> from vector<4x5x6xi1> - return %extract : vector<6xi1> -} - -// ----- - -// CHECK-LABEL: func.func @extract_scalar_with_dynamic_positions( -// CHECK-SAME: %[[INDEX:.*]]: index) -> i1 { -func.func @extract_scalar_with_dynamic_positions(%index: index) -> i1 { -// CHECK: %[[S0:.*]] = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> -// CHECK-NEXT: %[[S1:.*]] = vector.extract %[[S0]][1, 1, %[[INDEX]]] : i1 from vector<4x4x4xi1> -// CHECK-NEXT: return %[[S1]] : i1 - %mask = vector.constant_mask [2, 2, 3] : vector<4x4x4xi1> - %extract = vector.extract %mask[1, 1, %index] : i1 from vector<4x4x4xi1> - return %extract : i1 -} - -// 
-----
-
-// CHECK-LABEL: func.func @extract_subvector_with_dynamic_positions
-// CHECK-SAME: %[[INDEX:.*]]: index) -> vector<6xi1> {
-func.func @extract_subvector_with_dynamic_positions(%index: index) -> vector<6xi1> {
-// CHECK: %[[S0:.*]] = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1>
-// CHECK-NEXT: %[[S1:.*]] = vector.extract %[[S0]][1, %[[INDEX]]] : vector<6xi1> from vector<4x5x6xi1>
-// CHECK-NEXT: return %[[S1]] : vector<6xi1>
-  %mask = vector.constant_mask [2, 3, 4] : vector<4x5x6xi1>
-  %extract = vector.extract %mask[1, %index] : vector<6xi1> from vector<4x5x6xi1>
-  return %extract : vector<6xi1>
-}

From 80cff273906b200eed2f779913f49a64cad8c0c6 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 28 Feb 2024 07:27:39 -0800
Subject: [PATCH 087/114] [LV][NFC] Fix a misprint, NFC.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e5bce7299a970fe..6b16923213c86eb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1517,7 +1517,7 @@ class LoopVectorizationCostModel {
 
   /// Selects and saves TailFoldingStyle for 2 options - if IV update may
   /// overflow or not.
-  void setTailFoldinStyles() {
+  void setTailFoldingStyles() {
     assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None &&
            ChosenTailFoldingStyle.second == TailFoldingStyle::None &&
            "Tail folding must not be selected yet.");
@@ -4652,7 +4652,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  setTailFoldinStyles();
+  setTailFoldingStyles();
   if (foldTailByMasking())
     return MaxFactors;
 
From 4623c114fb51e48b5cbb352adecb60d76077cb0a Mon Sep 17 00:00:00 2001
From: Diego Caballero
Date: Wed, 28 Feb 2024 08:15:47 -0800
Subject: [PATCH 088/114] [mlir][Vector] Support vector.insert in bubbling bitcast patterns (#82843)

This PR adds support for `vector.insert` to the patterns that bubble up
and down `vector.bitcast` ops across `vector.extract/extract_slice/insert_slice`
ops.

---
 .../Vector/Transforms/VectorTransforms.cpp    | 74 ++++++++++++++++++-
 .../Dialect/Vector/vector-transforms.mlir     | 45 +++++++++++
 2 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 74dd1dfaca0da83..a2d4e216633181f 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -710,6 +710,76 @@ struct BubbleDownBitCastForStridedSliceExtract
   }
 };
 
+// Shuffles vector.bitcast op before vector.insert op.
+//
+// This transforms IR like:
+//   %0 = vector.insert %val, %dst[4] : vector<32xi4> into vector<8x32xi4>
+//   %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+// Into:
+//   %0 = vector.bitcast %val : vector<32xi4> to vector<16xi8>
+//   %1 = vector.bitcast %dst : vector<8x32xi4> to vector<8x16xi8>
+//   %2 = vector.insert %0, %1 [4] : vector<16xi8> into vector<8x16xi8>
+//
+struct BubbleUpBitCastForInsert : public OpRewritePattern<vector::BitCastOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::BitCastOp bitcastOp,
+                                PatternRewriter &rewriter) const override {
+    VectorType castSrcType = bitcastOp.getSourceVectorType();
+    VectorType castDstType = bitcastOp.getResultVectorType();
+
+    // 0-D and scalable vectors are not supported yet.
+    if (castSrcType.getRank() == 0 || castSrcType.isScalable() ||
+        castDstType.isScalable())
+      return failure();
+
+    int64_t castSrcLastDim = castSrcType.getShape().back();
+    int64_t castDstLastDim = castDstType.getShape().back();
+    bool isNumElemsShrink = castSrcLastDim >= castDstLastDim;
+    int64_t ratio;
+    if (isNumElemsShrink) {
+      assert(castSrcLastDim % castDstLastDim == 0);
+      ratio = castSrcLastDim / castDstLastDim;
+    } else {
+      assert(castDstLastDim % castSrcLastDim == 0);
+      ratio = castDstLastDim / castSrcLastDim;
+    }
+
+    auto insertOp = bitcastOp.getSource().getDefiningOp<vector::InsertOp>();
+    if (!insertOp)
+      return failure();
+
+    // Only vector sources are supported for now.
+    auto insertSrcType = dyn_cast<VectorType>(insertOp.getSourceType());
+    if (!insertSrcType)
+      return failure();
+
+    // Bitcast the source.
+    SmallVector<int64_t> srcDims(insertSrcType.getShape());
+    srcDims.back() =
+        isNumElemsShrink ? srcDims.back() / ratio : srcDims.back() * ratio;
+    VectorType newCastSrcType =
+        VectorType::get(srcDims, castDstType.getElementType());
+    auto newCastSrcOp = rewriter.create<vector::BitCastOp>(
+        bitcastOp.getLoc(), newCastSrcType, insertOp.getSource());
+
+    SmallVector<int64_t> dstDims(insertOp.getDestVectorType().getShape());
+    dstDims.back() =
+        isNumElemsShrink ? dstDims.back() / ratio : dstDims.back() * ratio;
+    VectorType newCastDstType =
+        VectorType::get(dstDims, castDstType.getElementType());
+
+    // Bitcast the destination.
+    auto newCastDstOp = rewriter.create<vector::BitCastOp>(
+        bitcastOp.getLoc(), newCastDstType, insertOp.getDest());
+
+    // Generate new insert.
+    rewriter.replaceOpWithNewOp<vector::InsertOp>(
+        bitcastOp, newCastSrcOp, newCastDstOp, insertOp.getMixedPosition());
+    return success();
+  }
+};
+
 // Shuffles vector.bitcast op before vector.insert_strided_slice op.
 //
 // This transforms IR like:
@@ -1782,8 +1852,8 @@ void mlir::vector::populateBubbleVectorBitCastOpPatterns(
     RewritePatternSet &patterns, PatternBenefit benefit) {
   patterns.add<BubbleDownVectorBitCastForExtract,
                BubbleDownBitCastForStridedSliceExtract,
-               BubbleUpBitCastForStridedSliceInsert>(patterns.getContext(),
-                                                     benefit);
+               BubbleUpBitCastForInsert, BubbleUpBitCastForStridedSliceInsert>(
+      patterns.getContext(), benefit);
 }
 
 void mlir::vector::populateBreakDownVectorBitCastOpPatterns(
diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir
index ea10bd56390c785..eda6a5cc40d9993 100644
--- a/mlir/test/Dialect/Vector/vector-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-transforms.mlir
@@ -339,6 +339,51 @@ func.func @bubble_down_bitcast_in_strided_slice_extract_odd_size(%arg0: vector<4
   return %0: vector<3xf16>
 }
 
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i4_i8(
+// CHECK-SAME: %[[VAL:.*]]: vector<32xi4>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x32xi4>) -> vector<8x16xi8> {
+func.func @bubble_up_bitcast_in_insert_i4_i8(%val: vector<32xi4>, %src: vector<8x32xi4>) -> vector<8x16xi8> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<32xi4> to vector<16xi8>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x32xi4> to vector<8x16xi8>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xi8> into vector<8x16xi8>
+  %0 = vector.insert %val, %src[4] : vector<32xi4> into vector<8x32xi4>
+  %1 = vector.bitcast %0 : vector<8x32xi4> to vector<8x16xi8>
+  return %1 : vector<8x16xi8>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i8_i4(
+// CHECK-SAME: %[[VAL:.*]]: vector<16xi8>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x16xi8>) -> vector<8x32xi4> {
+func.func @bubble_up_bitcast_in_insert_i8_i4(%val: vector<16xi8>, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi8> to vector<32xi4>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi8> to vector<8x32xi4>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<32xi4> into vector<8x32xi4>
+  %0 = vector.insert %val, %src[4] : vector<16xi8> into vector<8x16xi8>
+  %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+  return %1 : vector<8x32xi4>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_i32_f32(
+// CHECK-SAME: %[[VAL:.*]]: vector<16xi32>,
+// CHECK-SAME: %[[DST:.*]]: vector<8x16xi32>) -> vector<8x16xf32> {
+func.func @bubble_up_bitcast_in_insert_i32_f32(%val: vector<16xi32>, %src: vector<8x16xi32>) -> vector<8x16xf32> {
+// CHECK: %[[BC_VAL:.*]] = vector.bitcast %[[VAL]] : vector<16xi32> to vector<16xf32>
+// CHECK: %[[BC_DST:.*]] = vector.bitcast %[[DST]] : vector<8x16xi32> to vector<8x16xf32>
+// CHECK: vector.insert %[[BC_VAL]], %[[BC_DST]] [4] : vector<16xf32> into vector<8x16xf32>
+  %0 = vector.insert %val, %src[4] : vector<16xi32> into vector<8x16xi32>
+  %1 = vector.bitcast %0 : vector<8x16xi32> to vector<8x16xf32>
+  return %1 : vector<8x16xf32>
+}
+
+// CHECK-LABEL: func.func @bubble_up_bitcast_in_insert_scalar(
+func.func @bubble_up_bitcast_in_insert_scalar(%val: i8, %src: vector<8x16xi8>) -> vector<8x32xi4> {
+// CHECK: vector.insert
+// CHECK-NEXT: vector.bitcast
  %0 = vector.insert %val, %src[4, 8] : i8 into vector<8x16xi8>
+  %1 = vector.bitcast %0 : vector<8x16xi8> to vector<8x32xi4>
+  return %1 : vector<8x32xi4>
+}
+
 // CHECK-LABEL: func @bubble_up_bitcast_in_strided_slice_insert
 // CHECK-SAME: (%[[DST:.+]]: vector<8xf16>, %[[SRC1:.+]]: vector<4xf16>, %[[SRC2:.+]]: vector<4xf16>)
 func.func
@bubble_up_bitcast_in_strided_slice_insert(%dst: vector<8xf16>, %src1: vector<4xf16>, %src2: vector<4xf16>) -> vector<4xf32> {

From 3fac0562f8bdf193f039945eb40c51a5e6c24de1 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 28 Feb 2024 16:27:56 +0000
Subject: [PATCH 089/114] [VPlan] Reset trip count when replacing ExpandSCEV recipe.

Otherwise accessing the trip count may access freed memory.

Fixes https://lab.llvm.org/buildbot/#/builders/74/builds/26239 and
others.

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++
 llvm/lib/Transforms/Vectorize/VPlan.h           | 8 ++++++++
 2 files changed, 10 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6b16923213c86eb..ea77b6018c0d15f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10093,6 +10093,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
         auto *ExpandedVal = BestEpiPlan.getVPValueOrAddLiveIn(
             ExpandedSCEVs.find(ExpandR->getSCEV())->second);
         ExpandR->replaceAllUsesWith(ExpandedVal);
+        if (BestEpiPlan.getTripCount() == ExpandR)
+          BestEpiPlan.resetTripCount(ExpandedVal);
         ExpandR->eraseFromParent();
       }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 47cebecdb27083a..16c09a83e777dde 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2931,6 +2931,14 @@ class VPlan {
     return TripCount;
   }
 
+  /// Resets the trip count for the VPlan. The caller must make sure all uses of
+  /// the original trip count have been replaced.
+  void resetTripCount(VPValue *NewTripCount) {
+    assert(TripCount && NewTripCount && TripCount->getNumUsers() == 0 &&
+           "TripCount always must be set");
+    TripCount = NewTripCount;
+  }
+
   /// The backedge taken count of the original loop.
   VPValue *getOrCreateBackedgeTakenCount() {
    if (!BackedgeTakenCount)

From 64422cf826354ee1d586c2484ec72d66db898e75 Mon Sep 17 00:00:00 2001
From: Leandro Lupori
Date: Wed, 28 Feb 2024 13:33:42 -0300
Subject: [PATCH 090/114] [llvm][mlir][OMPIRBuilder] Translate omp.single's copyprivate (#80488)

Use the new copyprivate list from omp.single to emit calls to
__kmpc_copyprivate during the creation of the single operation in
OMPIRBuilder.

This is patch 4 of 4, to add support for COPYPRIVATE in Flang.

Original PR: https://github.com/llvm/llvm-project/pull/73128

---
 flang/test/Integration/OpenMP/copyprivate.f90 |  97 +++++++++++
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   6 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  33 +++-
 .../Frontend/OpenMPIRBuilderTest.cpp          | 153 +++++++++++++++++-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  16 +-
 mlir/test/Target/LLVMIR/openmp-llvm.mlir      |  37 +++++
 6 files changed, 330 insertions(+), 12 deletions(-)
 create mode 100644 flang/test/Integration/OpenMP/copyprivate.f90

diff --git a/flang/test/Integration/OpenMP/copyprivate.f90 b/flang/test/Integration/OpenMP/copyprivate.f90
new file mode 100644
index 000000000000000..9318b743a95290c
--- /dev/null
+++ b/flang/test/Integration/OpenMP/copyprivate.f90
@@ -0,0 +1,97 @@
+!===----------------------------------------------------------------------===!
+! This directory can be used to add Integration tests involving multiple
+! stages of the compiler (for eg. from Fortran to LLVM IR). It should not
+! contain executable tests. We should only add tests here sparingly and only
+! if there is no other way to test. Repeat this message in each test that is
+!
added to this directory and sub-directories. +!===----------------------------------------------------------------------===! + +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s + +!CHECK-DAG: define void @_copy_box_Uxi32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_10xi32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_i64(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_box_Uxi64(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_f32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_2x3xf32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_z32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_10xz32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_l32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_5xl32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_c8x8(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_10xc8x8(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_c16x5(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_rec__QFtest_typesTdt(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_box_heap_Uxi32(ptr %{{.*}}, ptr %{{.*}}) +!CHECK-DAG: define void @_copy_box_ptr_Uxc8x9(ptr %{{.*}}, ptr %{{.*}}) + +!CHECK-LABEL: define void @_copy_i32( +!CHECK-SAME: ptr %[[DST:.*]], ptr %[[SRC:.*]]) { +!CHECK-NEXT: %[[SRC_VAL:.*]] = load i32, ptr %[[SRC]] +!CHECK-NEXT: store i32 %[[SRC_VAL]], ptr %[[DST]] +!CHECK-NEXT: ret void +!CHECK-NEXT: } + +!CHECK-LABEL: define internal void @test_scalar_..omp_par({{.*}}) +!CHECK: %[[I:.*]] = alloca i32, i64 1 +!CHECK: %[[J:.*]] = alloca i32, i64 1 +!CHECK: %[[DID_IT:.*]] = alloca i32 +!CHECK: store i32 0, ptr %[[DID_IT]] +!CHECK: %[[THREAD_NUM1:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]]) +!CHECK: %[[RET:.*]] = call i32 @__kmpc_single({{.*}}) +!CHECK: %[[NOT_ZERO:.*]] = icmp ne i32 %[[RET]], 0 +!CHECK: br i1 %[[NOT_ZERO]], label %[[OMP_REGION_BODY:.*]], label %[[OMP_REGION_END:.*]] + +!CHECK: [[OMP_REGION_END]]: +!CHECK: %[[THREAD_NUM2:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC:.*]]) +!CHECK: %[[DID_IT_VAL:.*]] = load i32, ptr %[[DID_IT]] +!CHECK: call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM2]], i64 0, ptr %[[I]], ptr @_copy_i32, i32 %[[DID_IT_VAL]]) +!CHECK: %[[THREAD_NUM3:.*]] = call i32 @__kmpc_global_thread_num(ptr @[[LOC]]) +!CHECK: %[[DID_IT_VAL2:.*]] = load i32, ptr %[[DID_IT]] +!CHECK: call void @__kmpc_copyprivate(ptr @[[LOC]], i32 %[[THREAD_NUM3]], i64 0, ptr %[[J]], ptr @_copy_i32, i32 %[[DID_IT_VAL2]]) + +!CHECK: [[OMP_REGION_BODY]]: +!CHECK: br label %[[OMP_SINGLE_REGION:.*]] +!CHECK: [[OMP_SINGLE_REGION]]: +!CHECK: store i32 11, ptr %[[I]] +!CHECK: store i32 22, ptr %[[J]] +!CHECK: br label %[[OMP_REGION_CONT3:.*]] +!CHECK: [[OMP_REGION_CONT3:.*]]: +!CHECK: store i32 1, ptr %[[DID_IT]] +!CHECK: call void @__kmpc_end_single(ptr @[[LOC]], i32 %[[THREAD_NUM1]]) +!CHECK: br label %[[OMP_REGION_END]] +subroutine test_scalar() + integer :: i, j + + !$omp parallel private(i, j) + !$omp single + i = 11 + j = 22 + !$omp end single copyprivate(i, j) + !$omp end parallel +end subroutine + +subroutine test_types(a, n) + integer :: a(:), n + integer(4) :: i4, i4a(10) + integer(8) :: i8, i8a(n) + real :: r, ra(2, 3) + complex :: z, za(10) + logical :: l, la(5) + character(kind=1, len=8) :: c1, c1a(10) + character(kind=2, len=5) :: c2 + + type dt + integer :: i + real :: r + end type + type(dt) :: t + + integer, allocatable :: aloc(:) + character(kind=1, len=9), pointer :: ptr(:) + + !$omp 
parallel private(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr) + !$omp single + !$omp end single copyprivate(a, i4, i4a, i8, i8a, r, ra, z, za, l, la, c1, c1a, c2, t, aloc, ptr) + !$omp end parallel +end subroutine diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 589a9066ac57aff..5bbaa8c208b8cdc 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1834,13 +1834,15 @@ class OpenMPIRBuilder { /// \param BodyGenCB Callback that will generate the region code. /// \param FiniCB Callback to finalize variable copies. /// \param IsNowait If false, a barrier is emitted. - /// \param DidIt Local variable used as a flag to indicate 'single' thread + /// \param CPVars copyprivate variables. + /// \param CPFuncs copy functions to use for each copyprivate variable. /// /// \returns The insertion position *after* the single call. InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, - llvm::Value *DidIt); + ArrayRef CPVars = {}, + ArrayRef CPFuncs = {}); /// Generator for '#omp master' /// diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 09f59c81123ea56..d65ed8c11d86cc0 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -4047,13 +4047,17 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle( const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, - FinalizeCallbackTy FiniCB, bool IsNowait, llvm::Value *DidIt) { + FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef CPVars, + ArrayRef CPFuncs) { if (!updateToLocation(Loc)) return Loc.IP; - // If needed (i.e. not null), initialize `DidIt` with 0 - if (DidIt) { + // If needed allocate and initialize `DidIt` with 0. + // DidIt: flag variable: 1=single thread; 0=not single thread. + llvm::Value *DidIt = nullptr; + if (!CPVars.empty()) { + DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext())); Builder.CreateStore(Builder.getInt32(0), DidIt); } @@ -4070,17 +4074,36 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle( Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single); Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args); + auto FiniCBWrapper = [&](InsertPointTy IP) { + FiniCB(IP); + + // The thread that executes the single region must set `DidIt` to 1. + // This is used by __kmpc_copyprivate, to know if the caller is the + // single thread or not. + if (DidIt) + Builder.CreateStore(Builder.getInt32(1), DidIt); + }; + // generates the following: // if (__kmpc_single()) { // .... single region ... // __kmpc_end_single // } + // __kmpc_copyprivate // __kmpc_barrier - EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB, + EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper, /*Conditional*/ true, /*hasFinalize*/ true); - if (!IsNowait) + + if (DidIt) { + for (size_t I = 0, E = CPVars.size(); I < E; ++I) + // NOTE BufSize is currently unused, so just pass 0. 
+ createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL), + /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I], + CPFuncs[I], DidIt); + // NOTE __kmpc_copyprivate already inserts a barrier + } else if (!IsNowait) createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false, /* CheckCancelFlag */ false); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index d923b25fda9f96e..fdbe8df783b117a 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -3327,8 +3327,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirective) { EXPECT_NE(IPBB->end(), IP.getPoint()); }; - Builder.restoreIP(OMPBuilder.createSingle( - Builder, BodyGenCB, FiniCB, /*IsNowait*/ false, /*DidIt*/ nullptr)); + Builder.restoreIP( + OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ false)); Value *EntryBBTI = EntryBB->getTerminator(); EXPECT_NE(EntryBBTI, nullptr); EXPECT_TRUE(isa(EntryBBTI)); @@ -3417,8 +3417,8 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { EXPECT_NE(IPBB->end(), IP.getPoint()); }; - Builder.restoreIP(OMPBuilder.createSingle( - Builder, BodyGenCB, FiniCB, /*IsNowait*/ true, /*DidIt*/ nullptr)); + Builder.restoreIP( + OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, /*IsNowait*/ true)); Value *EntryBBTI = EntryBB->getTerminator(); EXPECT_NE(EntryBBTI, nullptr); EXPECT_TRUE(isa(EntryBBTI)); @@ -3464,6 +3464,151 @@ TEST_F(OpenMPIRBuilderTest, SingleDirectiveNowait) { EXPECT_EQ(ExitBarrier, nullptr); } +// Helper class to check each instruction of a BB. +class BBInstIter { + BasicBlock *BB; + BasicBlock::iterator BBI; + +public: + BBInstIter(BasicBlock *BB) : BB(BB), BBI(BB->begin()) {} + + bool hasNext() const { return BBI != BB->end(); } + + template InstTy *next() { + if (!hasNext()) + return nullptr; + Instruction *Cur = &*BBI++; + if (!isa(Cur)) + return nullptr; + return cast(Cur); + } +}; + +TEST_F(OpenMPIRBuilderTest, SingleDirectiveCopyPrivate) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + F->setName("func"); + IRBuilder<> Builder(BB); + + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + AllocaInst *PrivAI = nullptr; + + BasicBlock *EntryBB = nullptr; + BasicBlock *ThenBB = nullptr; + + Value *CPVar = Builder.CreateAlloca(F->arg_begin()->getType()); + Builder.CreateStore(F->arg_begin(), CPVar); + + FunctionType *CopyFuncTy = FunctionType::get( + Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, false); + Function *CopyFunc = + Function::Create(CopyFuncTy, Function::PrivateLinkage, "copy_var", *M); + + auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) { + if (AllocaIP.isSet()) + Builder.restoreIP(AllocaIP); + else + Builder.SetInsertPoint(&*(F->getEntryBlock().getFirstInsertionPt())); + PrivAI = Builder.CreateAlloca(F->arg_begin()->getType()); + Builder.CreateStore(F->arg_begin(), PrivAI); + + llvm::BasicBlock *CodeGenIPBB = CodeGenIP.getBlock(); + llvm::Instruction *CodeGenIPInst = &*CodeGenIP.getPoint(); + EXPECT_EQ(CodeGenIPBB->getTerminator(), CodeGenIPInst); + + Builder.restoreIP(CodeGenIP); + + // collect some info for checks later + ThenBB = Builder.GetInsertBlock(); + EntryBB = ThenBB->getUniquePredecessor(); + + // simple instructions for body + Value *PrivLoad = + Builder.CreateLoad(PrivAI->getAllocatedType(), PrivAI, "local.use"); + 
Builder.CreateICmpNE(F->arg_begin(), PrivLoad); + }; + + auto FiniCB = [&](InsertPointTy IP) { + BasicBlock *IPBB = IP.getBlock(); + // IP must be before the unconditional branch to ExitBB + EXPECT_NE(IPBB->end(), IP.getPoint()); + }; + + Builder.restoreIP(OMPBuilder.createSingle(Builder, BodyGenCB, FiniCB, + /*IsNowait*/ false, {CPVar}, + {CopyFunc})); + Value *EntryBBTI = EntryBB->getTerminator(); + EXPECT_NE(EntryBBTI, nullptr); + EXPECT_TRUE(isa(EntryBBTI)); + BranchInst *EntryBr = cast(EntryBB->getTerminator()); + EXPECT_TRUE(EntryBr->isConditional()); + EXPECT_EQ(EntryBr->getSuccessor(0), ThenBB); + BasicBlock *ExitBB = ThenBB->getUniqueSuccessor(); + EXPECT_EQ(EntryBr->getSuccessor(1), ExitBB); + + CmpInst *CondInst = cast(EntryBr->getCondition()); + EXPECT_TRUE(isa(CondInst->getOperand(0))); + + CallInst *SingleEntryCI = cast(CondInst->getOperand(0)); + EXPECT_EQ(SingleEntryCI->arg_size(), 2U); + EXPECT_EQ(SingleEntryCI->getCalledFunction()->getName(), "__kmpc_single"); + EXPECT_TRUE(isa(SingleEntryCI->getArgOperand(0))); + + // check ThenBB + BBInstIter ThenBBI(ThenBB); + // load PrivAI + auto *PrivLI = ThenBBI.next(); + EXPECT_NE(PrivLI, nullptr); + EXPECT_EQ(PrivLI->getPointerOperand(), PrivAI); + // icmp + EXPECT_TRUE(ThenBBI.next()); + // store 1, DidIt + auto *DidItSI = ThenBBI.next(); + EXPECT_NE(DidItSI, nullptr); + EXPECT_EQ(DidItSI->getValueOperand(), + ConstantInt::get(Type::getInt32Ty(Ctx), 1)); + Value *DidIt = DidItSI->getPointerOperand(); + // call __kmpc_end_single + auto *SingleEndCI = ThenBBI.next(); + EXPECT_NE(SingleEndCI, nullptr); + EXPECT_EQ(SingleEndCI->getCalledFunction()->getName(), "__kmpc_end_single"); + EXPECT_EQ(SingleEndCI->arg_size(), 2U); + EXPECT_TRUE(isa(SingleEndCI->getArgOperand(0))); + EXPECT_EQ(SingleEndCI->getArgOperand(1), SingleEntryCI->getArgOperand(1)); + // br ExitBB + auto *ExitBBBI = ThenBBI.next(); + EXPECT_NE(ExitBBBI, nullptr); + EXPECT_TRUE(ExitBBBI->isUnconditional()); + EXPECT_EQ(ExitBBBI->getOperand(0), ExitBB); + EXPECT_FALSE(ThenBBI.hasNext()); + + // check ExitBB + BBInstIter ExitBBI(ExitBB); + // call __kmpc_global_thread_num + auto *ThreadNumCI = ExitBBI.next(); + EXPECT_NE(ThreadNumCI, nullptr); + EXPECT_EQ(ThreadNumCI->getCalledFunction()->getName(), + "__kmpc_global_thread_num"); + // load DidIt + auto *DidItLI = ExitBBI.next(); + EXPECT_NE(DidItLI, nullptr); + EXPECT_EQ(DidItLI->getPointerOperand(), DidIt); + // call __kmpc_copyprivate + auto *CopyPrivateCI = ExitBBI.next(); + EXPECT_NE(CopyPrivateCI, nullptr); + EXPECT_EQ(CopyPrivateCI->arg_size(), 6U); + EXPECT_TRUE(isa(CopyPrivateCI->getArgOperand(3))); + EXPECT_EQ(CopyPrivateCI->getArgOperand(3), CPVar); + EXPECT_TRUE(isa(CopyPrivateCI->getArgOperand(4))); + EXPECT_EQ(CopyPrivateCI->getArgOperand(4), CopyFunc); + EXPECT_TRUE(isa(CopyPrivateCI->getArgOperand(5))); + DidItLI = cast(CopyPrivateCI->getArgOperand(5)); + EXPECT_EQ(DidItLI->getOperand(0), DidIt); + EXPECT_FALSE(ExitBBI.hasNext()); +} + TEST_F(OpenMPIRBuilderTest, OMPAtomicReadFlt) { OpenMPIRBuilder OMPBuilder(*M); OMPBuilder.initialize(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8c20689c4a39dd3..fd1de274da60e88 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -667,8 +667,22 @@ convertOmpSingle(omp::SingleOp &singleOp, llvm::IRBuilderBase &builder, moduleTranslation, bodyGenStatus); }; 
auto finiCB = [&](InsertPointTy codeGenIP) {}; + + // Handle copyprivate + Operation::operand_range cpVars = singleOp.getCopyprivateVars(); + std::optional cpFuncs = singleOp.getCopyprivateFuncs(); + llvm::SmallVector llvmCPVars; + llvm::SmallVector llvmCPFuncs; + for (size_t i = 0, e = cpVars.size(); i < e; ++i) { + llvmCPVars.push_back(moduleTranslation.lookupValue(cpVars[i])); + auto llvmFuncOp = SymbolTable::lookupNearestSymbolFrom( + singleOp, cast((*cpFuncs)[i])); + llvmCPFuncs.push_back( + moduleTranslation.lookupFunction(llvmFuncOp.getName())); + } + builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createSingle( - ompLoc, bodyCB, finiCB, singleOp.getNowait(), /*DidIt=*/nullptr)); + ompLoc, bodyCB, finiCB, singleOp.getNowait(), llvmCPVars, llvmCPFuncs)); return bodyGenStatus; } diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 39a1e036e85c027..12bd108ba86cb60 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -2186,6 +2186,43 @@ llvm.func @single_nowait(%x: i32, %y: i32, %zaddr: !llvm.ptr) { // ----- +llvm.func @copy_i32(!llvm.ptr, !llvm.ptr) +llvm.func @copy_f32(!llvm.ptr, !llvm.ptr) + +// CHECK-LABEL: @single_copyprivate +// CHECK-SAME: (ptr %[[ip:.*]], ptr %[[fp:.*]]) +llvm.func @single_copyprivate(%ip: !llvm.ptr, %fp: !llvm.ptr) { + // CHECK: %[[didit_addr:.*]] = alloca i32 + // CHECK: store i32 0, ptr %[[didit_addr]] + // CHECK: call i32 @__kmpc_single + omp.single copyprivate(%ip -> @copy_i32 : !llvm.ptr, %fp -> @copy_f32 : !llvm.ptr) { + // CHECK: %[[i:.*]] = load i32, ptr %[[ip]] + %i = llvm.load %ip : !llvm.ptr -> i32 + // CHECK: %[[i2:.*]] = add i32 %[[i]], %[[i]] + %i2 = llvm.add %i, %i : i32 + // CHECK: store i32 %[[i2]], ptr %[[ip]] + llvm.store %i2, %ip : i32, !llvm.ptr + // CHECK: %[[f:.*]] = load float, ptr %[[fp]] + %f = llvm.load %fp : !llvm.ptr -> f32 + // CHECK: %[[f2:.*]] = fadd float %[[f]], %[[f]] + %f2 = llvm.fadd %f, %f : f32 + // CHECK: store float %[[f2]], ptr %[[fp]] + llvm.store %f2, %fp : f32, !llvm.ptr + // CHECK: store i32 1, ptr %[[didit_addr]] + // CHECK: call void @__kmpc_end_single + // CHECK: %[[didit:.*]] = load i32, ptr %[[didit_addr]] + // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[ip]], ptr @copy_i32, i32 %[[didit]]) + // CHECK: %[[didit2:.*]] = load i32, ptr %[[didit_addr]] + // CHECK: call void @__kmpc_copyprivate({{.*}}, ptr %[[fp]], ptr @copy_f32, i32 %[[didit2]]) + // CHECK-NOT: call void @__kmpc_barrier + omp.terminator + } + // CHECK: ret void + llvm.return +} + +// ----- + // CHECK: @_QFsubEx = internal global i32 undef // CHECK: @_QFsubEx.cache = common global ptr null From cd344a4c20e295d49f8163ec9a0656c1061a6e42 Mon Sep 17 00:00:00 2001 From: Walter Erquinigo Date: Wed, 28 Feb 2024 11:43:36 -0500 Subject: [PATCH 091/114] [LLDB] Fix completion of space-only lines in the REPL on Linux (#83203) https://github.com/modularml/mojo/issues/1796 discovered that if you try to complete a space-only line in the REPL on Linux, LLDB crashes. I suspect that editline doesn't behave the same way on linux and on darwin, because I can't replicate this on darwin. Adding a boundary check in the completion code prevents the crash from happening. 
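
To make the crash mechanics concrete: completion walks the parsed argument
list at the cursor index, and for a whitespace-only line that index can point
past the last parsed argument. Below is a minimal standalone sketch of the
guard's shape. Note that `Args` here is just a `std::vector<std::string>`
stand-in and `QuoteToAppend` a hypothetical helper, not LLDB's actual API;
the real one-line fix is in the `Editline.cpp` hunk that follows.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Illustrative stand-in for LLDB's parsed-argument list (Args).
using Args = std::vector<std::string>;

// Returns the quote character that should terminate the argument under the
// cursor, or '\0' if there is nothing to terminate. The bounds check on
// CursorIndex is the crux: without it, a whitespace-only line (whose cursor
// can sit past the last parsed argument) indexes out of bounds.
char QuoteToAppend(const Args &ParsedLine, size_t CursorIndex) {
  if (ParsedLine.empty() || CursorIndex >= ParsedLine.size())
    return '\0';
  const std::string &Arg = ParsedLine[CursorIndex];
  if (!Arg.empty() && (Arg.front() == '"' || Arg.front() == '\''))
    return Arg.front();
  return '\0';
}

int main() {
  assert(QuoteToAppend({}, 0) == '\0');        // space-only line: no args
  assert(QuoteToAppend({"\"abc"}, 1) == '\0'); // cursor past the last arg
  assert(QuoteToAppend({"\"abc"}, 0) == '"');  // quoted arg under the cursor
  return 0;
}
```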
--- lldb/source/Host/common/Editline.cpp | 5 +++- lldb/test/API/repl/clang/TestClangREPL.py | 32 ++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index ce707e530d008b1..e66271e8a6ee99d 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1029,8 +1029,11 @@ unsigned char Editline::TabCommand(int ch) { case CompletionMode::Normal: { std::string to_add = completion.GetCompletion(); // Terminate the current argument with a quote if it started with a quote. - if (!request.GetParsedLine().empty() && request.GetParsedArg().IsQuoted()) + Args &parsedLine = request.GetParsedLine(); + if (!parsedLine.empty() && request.GetCursorIndex() < parsedLine.size() && + request.GetParsedArg().IsQuoted()) { to_add.push_back(request.GetParsedArg().GetQuoteChar()); + } to_add.push_back(' '); el_deletestr(m_editline, request.GetCursorArgumentPrefix().size()); el_insertstr(m_editline, to_add.c_str()); diff --git a/lldb/test/API/repl/clang/TestClangREPL.py b/lldb/test/API/repl/clang/TestClangREPL.py index 0b67955a7833c65..c37557fb94735d8 100644 --- a/lldb/test/API/repl/clang/TestClangREPL.py +++ b/lldb/test/API/repl/clang/TestClangREPL.py @@ -1,7 +1,6 @@ -import lldb from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * from lldbsuite.test.lldbpexpect import PExpectTest +from lldbsuite.test.lldbtest import * class TestCase(PExpectTest): @@ -17,13 +16,7 @@ def expect_repl(self, expr, substrs=[]): self.current_repl_line_number += 1 self.child.expect_exact(str(self.current_repl_line_number) + ">") - # PExpect uses many timeouts internally and doesn't play well - # under ASAN on a loaded machine.. - @skipIfAsan - @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot - @skipIfEditlineSupportMissing - def test_basic_completion(self): - """Test that we can complete a simple multiline expression""" + def start_repl(self): self.build() self.current_repl_line_number = 1 @@ -41,6 +34,14 @@ def test_basic_completion(self): self.child.send("expression --repl -l c --\n") self.child.expect_exact("1>") + # PExpect uses many timeouts internally and doesn't play well + # under ASAN on a loaded machine.. + @skipIfAsan + @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot + @skipIfEditlineSupportMissing + def test_basic_completion(self): + """Test that we can complete a simple multiline expression""" + self.start_repl() # Try evaluating a simple expression. self.expect_repl("3 + 3", substrs=["(int) $0 = 6"]) @@ -54,3 +55,16 @@ def test_basic_completion(self): self.expect_repl("$persistent + 10", substrs=["(long) $2 = 17"]) self.quit() + + # PExpect uses many timeouts internally and doesn't play well + # under ASAN on a loaded machine.. 
+ @skipIfAsan + @skipIf(oslist=["linux"], archs=["arm", "aarch64"]) # Randomly fails on buildbot + @skipIfEditlineSupportMissing + def test_completion_with_space_only_line(self): + """Test that we don't crash when completing lines with spaces only""" + self.start_repl() + + self.child.send(" ") + self.child.send("\t") + self.expect_repl("3 + 3", substrs=["(int) $0 = 6"]) From 26402777ebf4eb3d8f3d5a45943b451c740b2d76 Mon Sep 17 00:00:00 2001 From: Lukacma Date: Wed, 28 Feb 2024 16:45:39 +0000 Subject: [PATCH 092/114] [AArch64] Optimized generated assembly for bool to svbool_t conversions (#83001) In certain cases Legalizer was generating `AND(WHILELO, SPLAT 1)` instruction pattern, when `WHILELO` would be sufficient. --- .../Target/AArch64/AArch64ISelLowering.cpp | 6 +-- .../AArch64/sve-intrinsics-reinterpret.ll | 42 ++++++++++++++++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b86661c8a4e0d41..c21bc3a4abbc00c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -271,11 +271,9 @@ static bool isMergePassthruOpcode(unsigned Opc) { static bool isZeroingInactiveLanes(SDValue Op) { switch (Op.getOpcode()) { default: - // We guarantee i1 splat_vectors to zero the other lanes by - // implementing it with ptrue and possibly a punpklo for nxv1i1. - if (ISD::isConstantSplatVectorAllOnes(Op.getNode())) - return true; return false; + // We guarantee i1 splat_vectors to zero the other lanes + case ISD::SPLAT_VECTOR: case AArch64ISD::PTRUE: case AArch64ISD::SETCC_MERGE_ZERO: return true; diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll index 82bf756f8228984..c7c102f5d567d99 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reinterpret.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s @@ -150,6 +150,46 @@ define @chained_reinterpret() { ret %out } +define @reinterpret_scalar_bool_h(i1 %x){ +; CHECK-LABEL: reinterpret_scalar_bool_h: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sbfx x8, x0, #0, #1 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ret + %.splatinsert = insertelement poison, i1 %x, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %out = tail call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %.splat) + ret %out +} + +define @reinterpret_scalar_bool_s(i1 %x){ +; CHECK-LABEL: reinterpret_scalar_bool_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sbfx x8, x0, #0, #1 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ret + %.splatinsert = insertelement poison, i1 %x, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %out = tail call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %.splat) + ret %out +} + +define @reinterpret_scalar_bool_q(i1 %x){ +; CHECK-LABEL: reinterpret_scalar_bool_q: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: sbfx x8, x0, #0, #1 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ret + 
%.splatinsert = insertelement <vscale x 2 x i1> poison, i1 %x, i64 0
+  %.splat = shufflevector <vscale x 2 x i1> %.splatinsert, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %out = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %.splat)
+  ret <vscale x 16 x i1> %out
+}
+
+
 declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 immarg)
 declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 immarg)
 declare <vscale x 8 x i1> @llvm.aarch64.sve.cmpgt.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)

From 634b0243b8f7acc85af4f16b70e91d86ded4dc83 Mon Sep 17 00:00:00 2001
From: SivanShani-Arm
Date: Wed, 28 Feb 2024 17:02:51 +0000
Subject: [PATCH 093/114] [llvm][arm] add T1 and T2 assembly options for vlldm and vlstm (#83116)

T1 allows an optional register list; if present, the list must be exactly {d0-d15}.
T2 defines a mandatory register list; the list must be exactly {d0-d31}.

The requirements for T1/T2 are as follows:

             T1            T2
Require:     v8-M.Main,    v8.1-M.Main,
             secure state  secure state
16 D Regs    valid         valid
32 D Regs    UNDEFINED     valid
No D Regs    NOP           NOP

---
 llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp  | 57 +++++++++-----
 llvm/lib/Target/ARM/ARMInstrFormats.td        | 31 ++++++++
 llvm/lib/Target/ARM/ARMInstrVFP.td            | 64 +++++++++++-----
 .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 76 ++++++++++++++++---
 .../ARM/Disassembler/ARMDisassembler.cpp      | 23 ++++++
 .../ARM/MCTargetDesc/ARMInstPrinter.cpp       | 32 ++++++++
 .../CodeGen/ARM/cmse-vlldm-no-reorder.mir     |  4 +-
 llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir    |  9 +--
 llvm/test/MC/ARM/thumbv8m.s                   |  8 +-
 llvm/test/MC/ARM/vlstm-vlldm-8.1m.s           | 11 +++
 llvm/test/MC/ARM/vlstm-vlldm-8m.s             | 17 +++++
 llvm/test/MC/ARM/vlstm-vlldm-diag.s           | 61 +++++++++++++++
 .../ARM/armv8.1m-vlldm_vlstm-8.1.main.txt     | 11 +++
 .../ARM/armv8.1m-vlldm_vlstm-8.main.txt       | 17 +++++
 .../unittests/Target/ARM/MachineInstrTest.cpp |  2 +
 15 files changed, 362 insertions(+), 61 deletions(-)
 create mode 100644 llvm/test/MC/ARM/vlstm-vlldm-8.1m.s
 create mode 100644 llvm/test/MC/ARM/vlstm-vlldm-8m.s
 create mode 100644 llvm/test/MC/ARM/vlstm-vlldm-diag.s
 create mode 100644 llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt
 create mode 100644 llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt

diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index f0b69b0b09809f9..e78ea63c3b4a561 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1468,15 +1468,21 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
   if (passesFPReg)
     assert(STI->hasFPRegs() && "Subtarget needs fpregs");
 
-  // Lazy store all fp registers to the stack.
+  // Lazy store all fp registers to the stack
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
-  for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                 ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-    VLSTM.addReg(R, RegState::Implicit |
-                        (LiveRegs.contains(R) ? 0 : RegState::Undef));
+  MachineInstrBuilder VLSTM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseudo register list, has no effect on
+                      // the encoding.
+  // Mark non-live registers as undef
+  for (MachineOperand &MO : VLSTM->implicit_operands()) {
+    if (MO.isReg() && !MO.isDef()) {
+      Register Reg = MO.getReg();
+      MO.setIsUndef(!LiveRegs.contains(Reg));
+    }
+  }
 
   // Restore all arguments
   for (const auto &Regs : ClearedFPRegs) {
@@ -1563,14 +1569,20 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
           .addImm(CMSE_FP_SAVE_SIZE >> 2)
           .add(predOps(ARMCC::AL));
 
-    // Lazy store all FP registers to the stack
-    MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
-                                    .addReg(ARM::SP)
-                                    .add(predOps(ARMCC::AL));
-    for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
-                   ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
-      VLSTM.addReg(R, RegState::Implicit |
-                          (LiveRegs.contains(R) ? 0 : RegState::Undef));
+    // Lazy store all fp registers to the stack.
+    MachineInstrBuilder VLSTM =
+        BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+            .addReg(ARM::SP)
+            .add(predOps(ARMCC::AL))
+            .addImm(0); // Represents a pseudo register list, has no effect on
+                        // the encoding.
+    // Mark non-live registers as undef
+    for (MachineOperand &MO : VLSTM->implicit_operands()) {
+      if (MO.isReg() && MO.isImplicit() && !MO.isDef()) {
+        Register Reg = MO.getReg();
+        MO.setIsUndef(!LiveRegs.contains(Reg));
+      }
+    }
   } else {
     // Push all the callee-saved registers (s16-s31).
     MachineInstrBuilder VPUSH =
@@ -1673,9 +1685,12 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
 
   // Lazy load fp regs from stack.
   // This executes as NOP in the absence of floating-point support.
-  MachineInstrBuilder VLLDM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
-                                  .addReg(ARM::SP)
-                                  .add(predOps(ARMCC::AL));
+  MachineInstrBuilder VLLDM =
+      BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+          .addReg(ARM::SP)
+          .add(predOps(ARMCC::AL))
+          .addImm(0); // Represents a pseudo register list, has no effect on
+                      // the encoding.
 
   if (STI->fixCMSE_CVE_2021_35465()) {
     auto Bundler = MIBundleBuilder(MBB, VLLDM);
@@ -1757,7 +1772,9 @@ void ARMExpandPseudo::CMSERestoreFPRegsV81(
     // Load FP registers from stack.
     BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
         .addReg(ARM::SP)
-        .add(predOps(ARMCC::AL));
+        .add(predOps(ARMCC::AL))
+        .addImm(0); // Represents a pseudo register list, has no effect on the
+                    // encoding.
 
     // Pop the stack space
     BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 14e315534570d2d..404085820a66605 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -1749,6 +1749,37 @@ class AXSI4
 
+class AXSI4FR<string asm, bit et, bit load>
+  : InstARM {
+  // Instruction operands.
+  bits<4> Rn;
+  bits<13> regs; // Does not affect encoding, for assembly/disassembly only.
+  list<Predicate> Predicates = [HasVFP2];
+  let OutOperandList = (outs);
+  let InOperandList = (ins GPRnopc:$Rn, pred:$p, dpr_reglist:$regs);
+  let AsmString = asm;
+  let Pattern = [];
+  let DecoderNamespace = "VFP";
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{31-28} = 0b1110;
+  let Inst{27-25} = 0b110;
+  let Inst{24} = 0b0;
+  let Inst{23} = 0b0;
+  let Inst{22} = 0b0;
+  let Inst{21} = 0b1;
+  let Inst{20} = load; // Distinguishes vlldm from vlstm
+  let Inst{15-12} = 0b0000;
+  let Inst{11-9} = 0b101;
+  let Inst{8} = 0; // Single precision
+  let Inst{7} = et; // encoding type, 0 for T1 and 1 for T2.
+  let Inst{6-0} = 0b0000000;
+  let mayLoad = load;
+  let mayStore = !eq(load, 0);
+}
+
 // Double precision, unary
 class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index 55d3efbd9b9a2b4..3094a4db2b4d123 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -313,29 +313,51 @@ def : MnemonicAlias<"vstm", "vstmia">;
 //===----------------------------------------------------------------------===//
 // Lazy load / store multiple Instructions
 //
-def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlldm${p}\t$Rn", "", []>,
+// VLLDM and VLSTM:
+// 2 encoding options:
+// T1 (bit 7 is 0):
+//   T1 takes an optional dpr_reglist, which must be '{d0-d15}' (exactly)
+//   T1 requires v8-M.Main, secure state, target with 16 D registers (or with no D registers - NOP)
+// T2 (bit 7 is 1):
+//   T2 takes a mandatory dpr_reglist, which must be '{d0-d31}' (exactly)
+//   T2 requires v8.1-M.Main, secure state, target with 16/32 D registers (or with no D registers - NOP)
+// (source: Arm v8-M ARM, DDI0553B.v ID16122022)
+
+def VLLDM : AXSI4FR<"vlldm${p}\t$Rn, $regs", 0, 1>,
             Requires<[HasV8MMainline, Has8MSecExt]> {
-  let Inst{24-23} = 0b00;
-  let Inst{22} = 0;
-  let Inst{21} = 1;
-  let Inst{20} = 1;
-  let Inst{15-12} = 0;
-  let Inst{7-0} = 0;
-  let mayLoad = 1;
-  let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
-}
-
-def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
-                  NoItinerary, "vlstm${p}\t$Rn", "", []>,
+  let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+  let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlldm${p}\t$Rn", (VLLDM GPRnopc:$Rn, pred:$p, 0)>,
+      Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+// The register list has no effect on the encoding; it is for assembly/disassembly purposes only.
+def VLLDM_T2 : AXSI4FR<"vlldm${p}\t$Rn, $regs", 1, 1>,
+               Requires<[HasV8_1MMainline, Has8MSecExt]> {
+  let Defs = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+  let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly contains the register list.
+// The register list has no effect on the encoding; it is for assembly/disassembly purposes only.
+def VLSTM : AXSI4FR<"vlstm${p}\t$Rn, $regs", 0, 0>,
            Requires<[HasV8MMainline, Has8MSecExt]> {
-  let Inst{24-23} = 0b00;
-  let Inst{22} = 0;
-  let Inst{21} = 1;
-  let Inst{20} = 0;
-  let Inst{15-12} = 0;
-  let Inst{7-0} = 0;
-  let mayStore = 1;
+  let Defs = [VPR, FPSCR, FPSCR_NZCV];
+  let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15];
+  let DecoderMethod = "DecodeLazyLoadStoreMul";
+}
+// T1: assembly does not contain the register list.
+def : InstAlias<"vlstm${p}\t$Rn", (VLSTM GPRnopc:$Rn, pred:$p, 0)>,
+      Requires<[HasV8MMainline, Has8MSecExt]>;
+// T2: assembly must contain the register list.
+def VLSTM_T2 : AXSI4FR<"vlstm${p}\t$Rn, $regs", 1, 0>,
                Requires<[HasV8_1MMainline, Has8MSecExt]> {
+  let Defs = [VPR, FPSCR, FPSCR_NZCV];
+  let Uses = [VPR, FPSCR, FPSCR_NZCV, D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
+              D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30, D31];
+  let DecoderMethod = "DecodeLazyLoadStoreMul";
 }
 
 def : InstAlias<"vpush${p} $r", (VSTMDDB_UPD SP, pred:$p, dpr_reglist:$r), 0>,
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 37bfb76a494dee6..5efbaf0d41060c8 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -450,11 +450,12 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool validatetSTMRegList(const MCInst &Inst, const OperandVector &Operands,
                            unsigned ListNo);
 
-  int tryParseRegister();
+  int tryParseRegister(bool AllowOutOfBoundReg = false);
   bool tryParseRegisterWithWriteBack(OperandVector &);
   int tryParseShiftRegister(OperandVector &);
   bool parseRegisterList(OperandVector &, bool EnforceOrder = true,
-                         bool AllowRAAC = false);
+                         bool AllowRAAC = false,
+                         bool AllowOutOfBoundReg = false);
   bool parseMemory(OperandVector &);
   bool parseOperand(OperandVector &, StringRef Mnemonic);
   bool parseImmExpr(int64_t &Out);
@@ -4072,7 +4073,7 @@ ParseStatus ARMAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
 /// Try to parse a register name. The token must be an Identifier when called,
 /// and if it is a register name the token is eaten and the register number is
 /// returned. Otherwise return -1.
-int ARMAsmParser::tryParseRegister() {
+int ARMAsmParser::tryParseRegister(bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   if (Tok.isNot(AsmToken::Identifier)) return -1;
@@ -4116,7 +4117,8 @@ int ARMAsmParser::tryParseRegister() {
   }
 
   // Some FPUs only have 16 D registers, so D16-D31 are invalid
-  if (!hasD32() && RegNum >= ARM::D16 && RegNum <= ARM::D31)
+  if (!AllowOutOfBoundReg && !hasD32() && RegNum >= ARM::D16 &&
+      RegNum <= ARM::D31)
     return -1;
 
   Parser.Lex(); // Eat identifier token.
@@ -4456,7 +4458,7 @@ insertNoDuplicates(SmallVectorImpl<std::pair<unsigned, SMLoc>> &Regs,
 /// Parse a register list.
 bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
-                                     bool AllowRAAC) {
+                                     bool AllowRAAC, bool AllowOutOfBoundReg) {
   MCAsmParser &Parser = getParser();
   if (Parser.getTok().isNot(AsmToken::LCurly))
     return TokError("Token is not a Left Curly Brace");
@@ -4510,7 +4512,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
         return Error(RegLoc, "pseudo-register not allowed");
       Parser.Lex(); // Eat the minus.
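+      // Parse the end register of the range (e.g. the 'd31' in '{d0 - d31}').
+      // D16-D31 may be accepted here even without hasD32(), so that the
+      // VLLDM/VLSTM T2 register lists can be parsed.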
      SMLoc AfterMinusLoc = Parser.getTok().getLoc();
-      int EndReg = tryParseRegister();
+      int EndReg = tryParseRegister(AllowOutOfBoundReg);
       if (EndReg == -1)
         return Error(AfterMinusLoc, "register expected");
       if (EndReg == ARM::RA_AUTH_CODE)
@@ -4545,7 +4547,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder,
     RegLoc = Parser.getTok().getLoc();
     int OldReg = Reg;
     const AsmToken RegTok = Parser.getTok();
-    Reg = tryParseRegister();
+    Reg = tryParseRegister(AllowOutOfBoundReg);
     if (Reg == -1)
       return Error(RegLoc, "register expected");
     if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE)
@@ -6085,8 +6087,11 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
   }
   case AsmToken::LBrac:
     return parseMemory(Operands);
-  case AsmToken::LCurly:
-    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"));
+  case AsmToken::LCurly: {
+    bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm";
+    return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false,
+                             AllowOutOfBoundReg);
+  }
   case AsmToken::Dollar:
   case AsmToken::Hash: {
     // #42 -> immediate
@@ -7596,6 +7601,33 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
   const unsigned Opcode = Inst.getOpcode();
   switch (Opcode) {
+  case ARM::VLLDM:
+  case ARM::VLLDM_T2:
+  case ARM::VLSTM:
+  case ARM::VLSTM_T2: {
+    // Since in some cases both T1 and T2 are valid, tablegen cannot always
+    // pick the correct instruction.
+    if (Operands.size() == 4) { // a register list has been provided
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // the register list, a dpr_reglist
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // T2 requires v8.1-M.Main (cannot be handled by tablegen)
+      if (RegList.size() == 32 && !hasV8_1MMainline()) {
+        return Error(Op.getEndLoc(), "T2 version requires v8.1-M.Main");
+      }
+      // When the target has 32 D registers, T1 is undefined.
+      if (hasD32() && RegList.size() != 32) {
+        return Error(Op.getEndLoc(), "operand must be exactly {d0-d31}");
+      }
+      // When the target has 16 D registers, both T1 and T2 are valid.
+      if (!hasD32() && (RegList.size() != 16 && RegList.size() != 32)) {
+        return Error(Op.getEndLoc(),
+                     "operand must be exactly {d0-d15} (T1) or {d0-d31} (T2)");
+      }
+    }
+    return false;
+  }
   case ARM::t2IT: {
     // Encoding is unpredictable if it ever results in a notional 'NV'
     // predicate. Since we don't parse 'NV' directly this means an 'AL'
@@ -8731,6 +8763,32 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   }
 
   switch (Inst.getOpcode()) {
+  case ARM::VLLDM:
+  case ARM::VLSTM: {
+    // In some cases both T1 and T2 are valid, causing tablegen to pick T1
+    // instead of T2.
+    if (Operands.size() == 4) { // a register list has been provided
+      ARMOperand &Op = static_cast<ARMOperand &>(
+          *Operands[3]); // the register list, a dpr_reglist
+      assert(Op.isDPRRegList());
+      auto &RegList = Op.getRegList();
+      // When the register list is {d0-d31} the instruction has to be the T2
+      // variant
+      if (RegList.size() == 32) {
+        const unsigned Opcode =
+            (Inst.getOpcode() == ARM::VLLDM) ? ARM::VLLDM_T2 : ARM::VLSTM_T2;
+        MCInst TmpInst;
+        TmpInst.setOpcode(Opcode);
+        TmpInst.addOperand(Inst.getOperand(0));
+        TmpInst.addOperand(Inst.getOperand(1));
+        TmpInst.addOperand(Inst.getOperand(2));
+        TmpInst.addOperand(Inst.getOperand(3));
+        Inst = TmpInst;
+        return true;
+      }
+    }
+    return false;
+  }
   // Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
  case ARM::LDRT_POST:
   case ARM::LDRBT_POST: {
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 604f22d71119001..705f3cbce12f02d 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -700,6 +700,9 @@ DecodeMVEOverlappingLongShift(MCInst &Inst, unsigned Insn, uint64_t Address,
 static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
                                         uint64_t Address,
                                         const MCDisassembler *Decoder);
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder);
 
 #include "ARMGenDisassemblerTables.inc"
 
@@ -7030,3 +7033,23 @@ static DecodeStatus DecodeT2AddSubSPImm(MCInst &Inst, unsigned Insn,
 
   return DS;
 }
+
+static DecodeStatus DecodeLazyLoadStoreMul(MCInst &Inst, unsigned Insn,
+                                           uint64_t Address,
+                                           const MCDisassembler *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  const unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  // Add Rn, which holds the memory location to save to / load from and is the
+  // only operand that affects the encoding; '$Rn' in the assembly.
+  if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  // An optional predicate, '$p' in the assembly.
+  DecodePredicateOperand(Inst, ARMCC::AL, Address, Decoder);
+  // An immediate that represents the floating-point register list, '$regs' in
+  // the assembly.
+  Inst.addOperand(MCOperand::createImm(0)); // Arbitrary value, has no effect.
+
+  return S;
+}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index fbd067d79af0b3c..24e627cd9a4e1ff 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -91,6 +91,38 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
   unsigned Opcode = MI->getOpcode();
 
   switch (Opcode) {
+  case ARM::VLLDM: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLLDM_T2: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlldm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
+  case ARM::VLSTM: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d15}";
+    return;
+  }
+  case ARM::VLSTM_T2: {
+    const MCOperand &Reg = MI->getOperand(0);
+    O << '\t' << "vlstm" << '\t';
+    printRegName(O, Reg.getReg());
+    O << ", "
+      << "{d0 - d31}";
+    return;
+  }
   // Check for MOVs and print canonical forms, instead.
   case ARM::MOVsr: {
     // FIXME: Thumb variants?
diff --git a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir index 2bc4288884f1925..d006aa9ba38e844 100644 --- a/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir +++ b/llvm/test/CodeGen/ARM/cmse-vlldm-no-reorder.mir @@ -89,7 +89,7 @@ body: | # CHECK: $sp = t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, $r4, $r5, $r6, undef $r7, $r8, $r9, $r10, $r11 # CHECK-NEXT: $r0 = t2BICri $r0, 1, 14 /* CC::al */, $noreg, $noreg # CHECK-NEXT: $sp = tSUBspi $sp, 34, 14 /* CC::al */, $noreg -# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $q0, implicit undef $q1, implicit undef $q2, implicit undef $q3, implicit undef $q4, implicit undef $q5, implicit undef $q6, implicit undef $q7 +# CHECK-NEXT: VLSTM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit undef $vpr, implicit undef $fpscr, implicit undef $fpscr_nzcv, implicit undef $d0, implicit undef $d1, implicit undef $d2, implicit undef $d3, implicit undef $d4, implicit undef $d5, implicit undef $d6, implicit undef $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15 # CHECK-NEXT: $r1 = tMOVr $r0, 14 /* CC::al */, $noreg # CHECK-NEXT: $r2 = tMOVr $r0, 14 /* CC::al */, $noreg # CHECK-NEXT: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg @@ -105,7 +105,7 @@ body: | # CHECK-NEXT: t2MSR_M 3072, $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr # CHECK-NEXT: tBLXNSr 14 /* CC::al */, $noreg, killed $r0, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $s0 # CHECK-NEXT: $r12 = VMOVRS $s0, 14 /* CC::al */, $noreg -# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv +# CHECK-NEXT: VLLDM $sp, 14 /* CC::al */, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15 # CHECK-NEXT: $s0 = VMOVSR $r12, 14 /* CC::al */, $noreg # CHECK-NEXT: $sp = tADDspi $sp, 34, 14 /* CC::al */, $noreg # CHECK-NEXT: $sp = t2LDMIA_UPD $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11 diff --git a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir index 8c49a5316741150..ad53addcc21a353 100644 --- a/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir +++ b/llvm/test/CodeGen/ARM/vlldm-vlstm-uops.mir @@ -2,7 +2,7 @@ --- | target triple = "thumbv8m.main-arm-none-eabi" - define hidden void @foo(ptr nocapture %baz) local_unnamed_addr #0 { + define hidden void @foo(void ()* nocapture %baz) local_unnamed_addr #0 { entry: %call = call i32 @bar() #0 %tobool = icmp eq i32 %call, 0 @@ -55,14 +55,13 @@ body: | tBL 14, $noreg, @bar, csr_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def dead $r0 bb.2.land.end: - liveins: $r4 - + liveins: $r4, $vpr, $fpscr, $fpscr_nzcv, $d0, $d1, $d2, $d3, $d4, $d5, $d6, $d7, $d8, $d9, $d10, $d11, $d12, $d13, $d14, $d15 $sp = t2STMDB_UPD $sp, 14, 
$noreg, $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11 $r4 = t2BICri $r4, 1, 14, $noreg, $noreg $sp = tSUBspi $sp, 34, 14, $noreg - VLSTM $sp, 14, $noreg + VLSTM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit $vpr, implicit $fpscr, implicit $fpscr_nzcv, implicit $d0, implicit $d1, implicit $d2, implicit $d3, implicit $d4, implicit $d5, implicit $d6, implicit $d7, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, implicit $d14, implicit $d15 tBLXNSr 14, $noreg, killed $r4, csr_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp - VLLDM $sp, 14, $noreg, implicit-def $q0, implicit-def $q1, implicit-def $q2, implicit-def $q3, implicit-def $q4, implicit-def $q5, implicit-def $q6, implicit-def $q7, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv + VLLDM $sp, 14, $noreg, 0, implicit-def $vpr, implicit-def $fpscr, implicit-def $fpscr_nzcv, implicit-def $d0, implicit-def $d1, implicit-def $d2, implicit-def $d3, implicit-def $d4, implicit-def $d5, implicit-def $d6, implicit-def $d7, implicit-def $d8, implicit-def $d9, implicit-def $d10, implicit-def $d11, implicit-def $d12, implicit-def $d13, implicit-def $d14, implicit-def $d15 $sp = tADDspi $sp, 34, 14, $noreg $sp = t2LDMIA_UPD $sp, 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11 $sp = t2LDMIA_RET $sp, 14, $noreg, def $r4, def $pc diff --git a/llvm/test/MC/ARM/thumbv8m.s b/llvm/test/MC/ARM/thumbv8m.s index 0e9ab4a9b3bf911..f03dd03dae3a4f9 100644 --- a/llvm/test/MC/ARM/thumbv8m.s +++ b/llvm/test/MC/ARM/thumbv8m.s @@ -184,13 +184,13 @@ ttat r0, r1 // 'Lazy Load/Store Multiple' // UNDEF-BASELINE: error: instruction requires: armv8m.main -// CHECK-MAINLINE: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a] -// CHECK-MAINLINE_DSP: vlldm r5 @ encoding: [0x35,0xec,0x00,0x0a] +// CHECK-MAINLINE: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a] +// CHECK-MAINLINE_DSP: vlldm r5, {d0 - d15} @ encoding: [0x35,0xec,0x00,0x0a] vlldm r5 // UNDEF-BASELINE: error: instruction requires: armv8m.main -// CHECK-MAINLINE: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a] -// CHECK-MAINLINE_DSP: vlstm r10 @ encoding: [0x2a,0xec,0x00,0x0a] +// CHECK-MAINLINE: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a] +// CHECK-MAINLINE_DSP: vlstm r10, {d0 - d15} @ encoding: [0x2a,0xec,0x00,0x0a] vlstm r10 // New SYSm's diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s new file mode 100644 index 000000000000000..4e35883ffe43327 --- /dev/null +++ b/llvm/test/MC/ARM/vlstm-vlldm-8.1m.s @@ -0,0 +1,11 @@ +// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s \ +// RUN: | FileCheck --check-prefixes=CHECK %s + +// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding %s \ +// RUN: | FileCheck --check-prefixes=CHECK %s + +vlstm r8, {d0 - d31} +// CHECK: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a] + +vlldm r8, {d0 - d31} +// CHECK: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a] diff --git a/llvm/test/MC/ARM/vlstm-vlldm-8m.s b/llvm/test/MC/ARM/vlstm-vlldm-8m.s new file mode 100644 index 000000000000000..bbc95318aeb3d09 --- /dev/null +++ b/llvm/test/MC/ARM/vlstm-vlldm-8m.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding %s \ +// RUN: | FileCheck --check-prefixes=CHECK %s + +// RUN: llvm-mc 
-triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding %s \ +// RUN: | FileCheck --check-prefixes=CHECK %s + +vlstm r8, {d0 - d15} +// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] + +vlldm r8, {d0 - d15} +// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] + +vlstm r8 +// CHECK: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a] + +vlldm r8 +// CHECK: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a] diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s new file mode 100644 index 000000000000000..b57f535c6a25cf7 --- /dev/null +++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s @@ -0,0 +1,61 @@ +// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \ +// RUN: | FileCheck --check-prefixes=ERR %s + +// RUN: not llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding %s 2>&1 >/dev/null \ +// RUN: | FileCheck --check-prefixes=ERRT2 %s + +vlstm r8, {d0 - d11} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d0 - d11} + +vlldm r8, {d0 - d11} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d0 - d11} + +vlstm r8, {d3 - d15} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d3 - d15} + +vlldm r8, {d3 - d15} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d3 - d15} + +vlstm r8, {d0 - d29} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d0 - d29} + +vlldm r8, {d0 - d29} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d0 - d29} + +vlstm r8, {d3 - d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d3 - d31} + +vlldm r8, {d3 - d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d3 - d31} + +vlstm r8, {d0 - d35} +// ERR: error: register expected +// ERR-NEXT: vlstm r8, {d0 - d35} + +vlldm r8, {d0 - d35} +// ERR: error: register expected +// ERR-NEXT: vlldm r8, {d0 - d35} + +vlstm pc +// ERR: error: operand must be a register in range [r0, r14] +// ERR-NEXT: vlstm pc + +vlldm pc +// ERR: error: operand must be a register in range [r0, r14] +// ERR-NEXT: vlldm pc + +vlstm pc +// ERRT2: error: operand must be a register in range [r0, r14] +// ERRT2-NEXT: vlstm pc + +vlldm pc +// ERRT2: error: operand must be a register in range [r0, r14] +// ERRT2-NEXT: vlldm pc \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt new file mode 100644 index 000000000000000..6b9882454c06a32 --- /dev/null +++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.1.main.txt @@ -0,0 +1,11 @@ +// RUN: llvm-mc -triple=armv8.1m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-DISS + +// RUN: llvm-mc -triple=thumbv8.1m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-DISS + +[0x28,0xec,0x80,0x0a] +// CHECK-DISS: vlstm r8, {d0 - d31} @ encoding: [0x28,0xec,0x80,0x0a] + +[0x38,0xec,0x80,0x0a] +// CHECK-DISS: vlldm r8, {d0 - d31} @ encoding: [0x38,0xec,0x80,0x0a] \ No newline at end of file diff --git a/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt 
b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
new file mode 100644
index 000000000000000..1e28d5284c5b2ae
--- /dev/null
+++ b/llvm/test/MC/Disassembler/ARM/armv8.1m-vlldm_vlstm-8.main.txt
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -triple=armv8m.main-arm-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+// RUN: llvm-mc -triple=thumbv8m.main-none-eabi -mcpu=generic -show-encoding -disassemble %s \
+// RUN: | FileCheck %s --check-prefixes=CHECK-DISS
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
+
+[0x28,0xec,0x00,0x0a]
+// CHECK-DISS: vlstm r8, {d0 - d15} @ encoding: [0x28,0xec,0x00,0x0a]
+
+[0x38,0xec,0x00,0x0a]
+// CHECK-DISS: vlldm r8, {d0 - d15} @ encoding: [0x38,0xec,0x00,0x0a]
\ No newline at end of file
diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
index aeb25bf012d0345..3a76054ca4f36df 100644
--- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp
+++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp
@@ -1126,7 +1126,9 @@ TEST(MachineInstr, HasSideEffects) {
       VLDR_VPR_post,
       VLDR_VPR_pre,
       VLLDM,
+      VLLDM_T2,
       VLSTM,
+      VLSTM_T2,
       VMRS,
       VMRS_FPCXTNS,
       VMRS_FPCXTS,

From 28162b7832b87c85faedfca661e47d60e74337e9 Mon Sep 17 00:00:00 2001
From: Leandro Lupori
Date: Wed, 28 Feb 2024 14:04:23 -0300
Subject: [PATCH 094/114] [flang][OpenMP] Fix copyprivate test on ppc64le

---
 flang/test/Integration/OpenMP/copyprivate.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Integration/OpenMP/copyprivate.f90 b/flang/test/Integration/OpenMP/copyprivate.f90
index 9318b743a95290c..d32319a18c28bba 100644
--- a/flang/test/Integration/OpenMP/copyprivate.f90
+++ b/flang/test/Integration/OpenMP/copyprivate.f90
@@ -26,7 +26,7 @@
 !CHECK-DAG: define void @_copy_box_ptr_Uxc8x9(ptr %{{.*}}, ptr %{{.*}})
 
 !CHECK-LABEL: define void @_copy_i32(
-!CHECK-SAME:                         ptr %[[DST:.*]], ptr %[[SRC:.*]]) {
+!CHECK-SAME:                         ptr %[[DST:.*]], ptr %[[SRC:.*]]){{.*}} {
 !CHECK-NEXT:    %[[SRC_VAL:.*]] = load i32, ptr %[[SRC]]
 !CHECK-NEXT:    store i32 %[[SRC_VAL]], ptr %[[DST]]
 !CHECK-NEXT:    ret void

From 95e036956f0a610027907df9a8f99d1f3c3b4cf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Wed, 28 Feb 2024 16:14:49 +0100
Subject: [PATCH 095/114] [clang][Interp] Note UB when converting Inf to
 integer

The F.isFinite() check here was introduced back when the first
floating point implementation was pushed, but I don't think it's
necessary, and removing it fixes the attached test case.
---
 clang/lib/AST/Interp/Interp.h   | 2 +-
 clang/test/AST/Interp/cxx20.cpp | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 241d5941e143ee1..13e004371f912ca 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1680,7 +1680,7 @@ bool CastFloatingIntegral(InterpState &S, CodePtr OpPC) {
     auto Status = F.convertToInteger(Result);
 
     // Float-to-Integral overflow check.
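+    // Infinities and NaN also report opInvalidOp from convertToInteger, so
+    // dropping the isFinite() guard below lets those cases be diagnosed too.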
- if ((Status & APFloat::opStatus::opInvalidOp) && F.isFinite()) { + if ((Status & APFloat::opStatus::opInvalidOp)) { const Expr *E = S.Current->getExpr(OpPC); QualType Type = E->getType(); diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp index 2c28e53784c5c63..000ffe39eb94a73 100644 --- a/clang/test/AST/Interp/cxx20.cpp +++ b/clang/test/AST/Interp/cxx20.cpp @@ -765,3 +765,12 @@ namespace FailingDestructor { f(); // both-error {{no matching function}} } } + + +void overflowInSwitchCase(int n) { + switch (n) { + case (int)(float)1e300: // both-error {{constant expression}} \ + // both-note {{value +Inf is outside the range of representable values of type 'int'}} + break; + } +} From b6f4dd9ee8cd4aa2c41622d75790df9341d9e043 Mon Sep 17 00:00:00 2001 From: srcarroll <50210727+srcarroll@users.noreply.github.com> Date: Wed, 28 Feb 2024 11:19:06 -0600 Subject: [PATCH 096/114] [mlir][transform] Implement `FlattenElementwiseLinalgOp` transform op (#81431) A `transform.structured.flatten_elementwise` op is implemented for flattening the iteration space and (applicable) operands/results to a single dimension. --- .../Linalg/TransformOps/LinalgTransformOps.td | 43 +++++++ .../Dialect/Linalg/Transforms/Transforms.h | 10 +- .../TransformOps/LinalgTransformOps.cpp | 25 ++++ .../Linalg/Transforms/ElementwiseOpFusion.cpp | 114 ++++++++++-------- .../Dialect/Linalg/flatten-elementwise.mlir | 99 +++++++++++++++ 5 files changed, 241 insertions(+), 50 deletions(-) create mode 100644 mlir/test/Dialect/Linalg/flatten-elementwise.mlir diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index 309573a562872fb..53ed31877c6f24f 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -2295,6 +2295,49 @@ def ConvertConv2DToImg2ColOp : Op { + let description = [{ + Flattens the iteration space and (applicable) operands of elementwise + linalg ops to a single dimension. + + Returns one handle: + - Flattened linalg operation. + + #### Return modes: + + Returns a definite failure if target is not isolated from above. + Returns a silenceable failure if the pattern application failed. 
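+
+    A minimal usage sketch (the `%target` handle name is illustrative; the
+    test added below shows a full match-and-flatten sequence):
+
+    ```mlir
+    %flattened = transform.structured.flatten_elementwise %target
+        : (!transform.any_op) -> !transform.any_op
+    ```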
+ }]; + + let arguments = (ins TransformHandleTypeInterface:$target); + let results = (outs TransformHandleTypeInterface:$transformed); + + let assemblyFormat = + "$target attr-dict `:` functional-type($target, results)"; + + let builders = [ + OpBuilder<(ins "Value":$target)> + ]; + + let extraClassDeclaration = [{ + ::mlir::DiagnosedSilenceableFailure applyToOne( + ::mlir::transform::TransformRewriter &rewriter, + ::mlir::linalg::LinalgOp target, + ::mlir::transform::ApplyToEachResultList &results, + ::mlir::transform::TransformState &state); + }]; +} + //===----------------------------------------------------------------------===// // Transpose Conv2D //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index a848d12fbbb50e9..65cf19e7a4fcd6a 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1074,6 +1074,11 @@ bool isDimSequencePreserved(AffineMap map, ReassociationIndicesRef dimSequence); bool areDimSequencesPreserved(ArrayRef maps, ArrayRef dimSequences); +struct CollapseResult { + SmallVector results; + LinalgOp collapsedOp; +}; + /// Collapses dimensions of linalg.generic/linalg.copy operation. A precondition /// to calling this method is that for each list in `foldedIterationDim`, the /// sequence of dimensions is contiguous in domains of all `indexing_maps` of @@ -1081,9 +1086,8 @@ bool areDimSequencesPreserved(ArrayRef maps, /// When valid, the method also collapses the operands of the op. Returns /// replacement values of the results of the original `linalgOp` by inserting /// reshapes to get back values of compatible types. -template -FailureOr> -collapseOpIterationDims(LinalgType op, +FailureOr +collapseOpIterationDims(LinalgOp op, ArrayRef foldedIterationDims, RewriterBase &rewriter); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 299965bcfc3ab36..ef9cd5561665fc7 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3244,6 +3244,31 @@ DiagnosedSilenceableFailure transform::ConvertConv2DToImg2ColOp::applyToOne( return DiagnosedSilenceableFailure::success(); } +//===----------------------------------------------------------------------===// +// FlattenElementwiseLinalgOp. +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure transform::FlattenElementwiseLinalgOp::applyToOne( + transform::TransformRewriter &rewriter, linalg::LinalgOp target, + transform::ApplyToEachResultList &results, + transform::TransformState &state) { + rewriter.setInsertionPoint(target); + if (target.getNumLoops() <= 1) + return DiagnosedSilenceableFailure::success(); + ReassociationIndices reassociation(target.getNumLoops()); + std::iota(reassociation.begin(), reassociation.end(), 0); + auto maybeFlattened = + (isElementwise(target)) + ? 
collapseOpIterationDims(target, reassociation, rewriter) + : FailureOr(rewriter.notifyMatchFailure( + target, "only elementwise flattening is supported")); + if (failed(maybeFlattened)) + return emitDefaultSilenceableFailure(target); + results.push_back(maybeFlattened->collapsedOp); + rewriter.replaceOp(target, maybeFlattened->results); + return DiagnosedSilenceableFailure::success(); +} + //===----------------------------------------------------------------------===// // TransposeConv2DOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 4977940cfbd7977..4797bfb2267d7fd 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -1446,24 +1446,20 @@ void generateCollapsedIndexingRegion(Location loc, Block *block, } } -template -Operation *createCollapsedOp(LinalgType op, - const CollapsingInfo &collapsingInfo, - RewriterBase &rewriter) { - static_assert(llvm::is_one_of::value, - "unsupported linalg op type to create"); +void collapseOperandsAndResults(LinalgOp op, + const CollapsingInfo &collapsingInfo, + RewriterBase &rewriter, + SmallVectorImpl &inputOperands, + SmallVectorImpl &outputOperands, + SmallVectorImpl &resultTypes) { Location loc = op->getLoc(); - - // Get the input operands. - SmallVector inputOperands = + inputOperands = llvm::map_to_vector(op.getDpsInputOperands(), [&](OpOperand *opOperand) { return getCollapsedOpOperand(loc, op, opOperand, collapsingInfo, rewriter); }); // Get the output operands and result types. - SmallVector resultTypes; - SmallVector outputOperands; resultTypes.reserve(op.getNumDpsInits()); outputOperands.reserve(op.getNumDpsInits()); for (OpOperand &output : op.getDpsInitsMutable()) { @@ -1475,41 +1471,69 @@ Operation *createCollapsedOp(LinalgType op, if (!op.hasPureBufferSemantics()) resultTypes.push_back(newOutput.getType()); } +} - if (isa(op)) { - return rewriter.create(loc, inputOperands[0], - outputOperands[0]); - } +/// Clone a `LinalgOp` to a collapsed version of same name +template +OpTy cloneToCollapsedOp(RewriterBase &rewriter, OpTy origOp, + const CollapsingInfo &collapsingInfo) { + return nullptr; +} - // Get the iterator types for the operand. - SmallVector iteratorTypes = - getCollapsedOpIteratorTypes(op.getIteratorTypesArray(), collapsingInfo); +/// Collapse any `LinalgOp` that does not require any specialization such as +/// indexing_maps, iterator_types, etc. +template <> +LinalgOp cloneToCollapsedOp(RewriterBase &rewriter, LinalgOp origOp, + const CollapsingInfo &collapsingInfo) { + SmallVector inputOperands, outputOperands; + SmallVector resultTypes; + collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands, + outputOperands, resultTypes); + return cast(clone( + rewriter, origOp, resultTypes, + llvm::to_vector(llvm::concat(inputOperands, outputOperands)))); +} - // Get the indexing maps. 
- auto indexingMaps = - llvm::map_to_vector(op.getIndexingMapsArray(), [&](AffineMap map) { +/// Collapse a `GenericOp` +template <> +GenericOp cloneToCollapsedOp(RewriterBase &rewriter, + GenericOp origOp, + const CollapsingInfo &collapsingInfo) { + SmallVector inputOperands, outputOperands; + SmallVector resultTypes; + collapseOperandsAndResults(origOp, collapsingInfo, rewriter, inputOperands, + outputOperands, resultTypes); + SmallVector indexingMaps( + llvm::map_range(origOp.getIndexingMapsArray(), [&](AffineMap map) { return getCollapsedOpIndexingMap(map, collapsingInfo); - }); + })); + + SmallVector iteratorTypes(getCollapsedOpIteratorTypes( + origOp.getIteratorTypesArray(), collapsingInfo)); - Operation *collapsedOp = rewriter.create( - loc, resultTypes, inputOperands, outputOperands, indexingMaps, + GenericOp collapsedOp = rewriter.create( + origOp.getLoc(), resultTypes, inputOperands, outputOperands, indexingMaps, iteratorTypes, [](OpBuilder &builder, Location loc, ValueRange args) {}); - Block *origOpBlock = &op->getRegion(0).front(); + Block *origOpBlock = &origOp->getRegion(0).front(); Block *collapsedOpBlock = &collapsedOp->getRegion(0).front(); rewriter.mergeBlocks(origOpBlock, collapsedOpBlock, collapsedOpBlock->getArguments()); - return collapsedOp; } +LinalgOp createCollapsedOp(LinalgOp op, const CollapsingInfo &collapsingInfo, + RewriterBase &rewriter) { + if (GenericOp genericOp = dyn_cast(op.getOperation())) { + return cloneToCollapsedOp(rewriter, genericOp, collapsingInfo); + } else { + return cloneToCollapsedOp(rewriter, op, collapsingInfo); + } +} + /// Implementation of fusion with reshape operation by collapsing dimensions. -template -FailureOr> mlir::linalg::collapseOpIterationDims( - LinalgType op, ArrayRef foldedIterationDims, +FailureOr mlir::linalg::collapseOpIterationDims( + LinalgOp op, ArrayRef foldedIterationDims, RewriterBase &rewriter) { - static_assert(llvm::is_one_of::value, - "unsupported linalg op type to collapse"); - // Bail on trivial no-op cases. if (op.getNumLoops() <= 1 || foldedIterationDims.empty() || llvm::all_of(foldedIterationDims, [](ReassociationIndicesRef foldedDims) { @@ -1538,8 +1562,7 @@ FailureOr> mlir::linalg::collapseOpIterationDims( } // Bail on non-canonical ranges. 
- SmallVector loopRanges = - cast(op.getOperation()).createLoopRanges(rewriter, op.getLoc()); + SmallVector loopRanges = op.createLoopRanges(rewriter, op.getLoc()); auto opFoldIsConstantValue = [](OpFoldResult ofr, int64_t value) { if (auto attr = llvm::dyn_cast_if_present(ofr)) return cast(attr).getInt() == value; @@ -1555,8 +1578,7 @@ FailureOr> mlir::linalg::collapseOpIterationDims( op, "expected all loop ranges to have zero start and unit stride"); } - LinalgType collapsedOp = cast( - createCollapsedOp(op, collapsingInfo, rewriter)); + LinalgOp collapsedOp = createCollapsedOp(op, collapsingInfo, rewriter); Location loc = op->getLoc(); if (collapsedOp.hasIndexSemantics()) { @@ -1597,7 +1619,7 @@ FailureOr> mlir::linalg::collapseOpIterationDims( results.push_back(collapsedOpResult); } } - return results; + return CollapseResult{results, collapsedOp}; } namespace { @@ -1629,15 +1651,14 @@ class FoldWithProducerReshapeOpByCollapsing continue; } - std::optional> replacements = - collapseOpIterationDims( - genericOp, collapsableIterationDims, rewriter); - if (!replacements) { + std::optional collapseResult = collapseOpIterationDims( + genericOp, collapsableIterationDims, rewriter); + if (!collapseResult) { return rewriter.notifyMatchFailure( genericOp, "failed to do the fusion by collapsing transformation"); } - rewriter.replaceOp(genericOp, *replacements); + rewriter.replaceOp(genericOp, collapseResult->results); return success(); } return failure(); @@ -1671,13 +1692,12 @@ class CollapseLinalgDimensions : public OpRewritePattern { op, "specified dimensions cannot be collapsed"); } - std::optional> replacements = - collapseOpIterationDims(op, collapsableIterationDims, - rewriter); - if (!replacements) { + std::optional collapseResult = + collapseOpIterationDims(op, collapsableIterationDims, rewriter); + if (!collapseResult) { return rewriter.notifyMatchFailure(op, "failed to collapse dimensions"); } - rewriter.replaceOp(op, *replacements); + rewriter.replaceOp(op, collapseResult->results); return success(); } diff --git a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir new file mode 100644 index 000000000000000..858c133dd536cae --- /dev/null +++ b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir @@ -0,0 +1,99 @@ +// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s + +// CHECK-LABEL: func.func @fill( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: memref<32x7xf32> +// CHECK-NEXT: %[[FLATTENED:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]] +// CHECK-NEXT: linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : memref<224xf32>) +func.func @fill(%cst: f32, %arg: memref<32x7xf32>) { + linalg.fill ins(%cst: f32) outs(%arg: memref<32x7xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op + %flattened = transform.structured.flatten_elementwise %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @fill_tensor( +// CHECK-SAME: %[[ARG0:.*]]: f32, +// CHECK-SAME: %[[ARG1:.*]]: tensor<32x7xf32> +// CHECK-NEXT: %[[FLATTENED:.*]] = tensor.collapse_shape %[[ARG1]] {{\[}}[0, 1]] +// CHECK-NEXT: %[[FLATTENED_RESULT:.*]] = linalg.fill ins(%[[ARG0]] : f32) outs(%[[FLATTENED]] : tensor<224xf32>) +// CHECK-NEXT: %[[RESULT:.*]] = 
tensor.expand_shape %[[FLATTENED_RESULT]] {{\[}}[0, 1]] +func.func @fill_tensor(%cst: f32, %arg: tensor<32x7xf32>) -> tensor<32x7xf32> { + %0 = linalg.fill ins(%cst: f32) outs(%arg: tensor<32x7xf32>) -> tensor<32x7xf32> + return %0 : tensor<32x7xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op + %flattened = transform.structured.flatten_elementwise %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @map( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-NEXT: %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]] +// CHECK-NEXT: %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]] +// CHECK-NEXT: %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]] +// CHECK-NEXT: linalg.map { arith.addf } ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>) +func.func @map(%arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) { + linalg.map {arith.addf} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>) + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op + %flattened = transform.structured.flatten_elementwise %0 + : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (d0)> +// CHECK-LABEL: func.func @generic +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<32x7xf32> +// CHECK-NEXT: %[[FLATTENED_0:.*]] = memref.collapse_shape %[[ARG0]] {{\[}}[0, 1]] +// CHECK-NEXT: %[[FLATTENED_1:.*]] = memref.collapse_shape %[[ARG1]] {{\[}}[0, 1]] +// CHECK-NEXT: %[[FLATTENED_2:.*]] = memref.collapse_shape %[[ARG2]] {{\[}}[0, 1]] +// CHECK-NEXT: linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]], #[[$MAP0]]], iterator_types = ["parallel"]} ins(%[[FLATTENED_0]], %[[FLATTENED_1]] : memref<224xf32>, memref<224xf32>) outs(%[[FLATTENED_2]] : memref<224xf32>) +// CHECK-NEXT: ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32, %[[C:.*]]: f32) +// CHECK-NEXT: %[[SUM:.*]] = arith.addf %[[A]], %[[B]] +// CHECK-NEXT: linalg.yield %[[SUM]] +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @generic( %arg0: memref<32x7xf32>, %arg1: memref<32x7xf32>, %arg2: memref<32x7xf32>) { + linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1: memref<32x7xf32>, memref<32x7xf32>) outs(%arg2: memref<32x7xf32>) { + ^bb0(%a: f32, %b: f32, %c: f32): + %0 = arith.addf %a, %b : f32 + linalg.yield %0 : f32 + } + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op + %flattened = transform.structured.flatten_elementwise %0 + : 
(!transform.any_op) -> !transform.any_op + transform.yield + } +} From 15b6908afa9b74ae62821ff44179d659277eef69 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Wed, 28 Feb 2024 09:36:28 -0800 Subject: [PATCH 097/114] [Headers][X86] Make brief descriptions briefer (#82422) In Sony's document processing, the first "paragraph" of the description is duplicated into a Brief Description section. Add paragraph breaks to make those brief descriptions less verbose. In fmaintrin.h we were including the \code blocks, which are inappropriate for a brief description. While I was in there, change \code to \code{.operation} which the Intel document processing wants. --- clang/lib/Headers/emmintrin.h | 121 +++++++++++++------------ clang/lib/Headers/fmaintrin.h | 48 ++++++---- clang/lib/Headers/mmintrin.h | 148 +++++++++++++++---------------- clang/lib/Headers/prfchwintrin.h | 16 ++-- clang/lib/Headers/smmintrin.h | 20 ++--- clang/lib/Headers/tmmintrin.h | 36 ++++---- 6 files changed, 201 insertions(+), 188 deletions(-) diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 96e3ebdecbdf83c..1d451b5f5b25de4 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2099,9 +2099,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, } /// Adds, with saturation, the corresponding elements of two 128-bit -/// signed [16 x i8] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are -/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. +/// signed [16 x i8] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [16 x i8]. +/// +/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums +/// less than 0x80 are saturated to 0x80. /// /// \headerfile /// @@ -2119,10 +2121,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, } /// Adds, with saturation, the corresponding elements of two 128-bit -/// signed [8 x i16] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF -/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. +/// signed [8 x i16] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [8 x i16]. +/// +/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums +/// less than 0x8000 are saturated to 0x8000. /// /// \headerfile /// @@ -2141,8 +2144,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, /// Adds, with saturation, the corresponding elements of two 128-bit /// unsigned [16 x i8] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF -/// are saturated to 0xFF. Negative sums are saturated to 0x00. +/// of a 128-bit result vector of [16 x i8]. +/// +/// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are +/// saturated to 0x00. /// /// \headerfile /// @@ -2161,8 +2166,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, /// Adds, with saturation, the corresponding elements of two 128-bit /// unsigned [8 x i16] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [8 x i16]. Positive sums greater than -/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 
+/// of a 128-bit result vector of [8 x i16]. +/// +/// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums +/// are saturated to 0x0000. /// /// \headerfile /// @@ -2518,10 +2525,12 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, return (__m128i)((__v2du)__a - (__v2du)__b); } -/// Subtracts corresponding 8-bit signed integer values in the input and -/// returns the differences in the corresponding bytes in the destination. -/// Differences greater than 0x7F are saturated to 0x7F, and differences less -/// than 0x80 are saturated to 0x80. +/// Subtracts, with saturation, corresponding 8-bit signed integer values in +/// the input and returns the differences in the corresponding bytes in the +/// destination. +/// +/// Differences greater than 0x7F are saturated to 0x7F, and differences +/// less than 0x80 are saturated to 0x80. /// /// \headerfile /// @@ -2538,8 +2547,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); } -/// Subtracts corresponding 16-bit signed integer values in the input and -/// returns the differences in the corresponding bytes in the destination. +/// Subtracts, with saturation, corresponding 16-bit signed integer values in +/// the input and returns the differences in the corresponding bytes in the +/// destination. +/// /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less /// than 0x8000 are saturated to 0x8000. /// @@ -2558,9 +2569,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); } -/// Subtracts corresponding 8-bit unsigned integer values in the input -/// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 0x00 are saturated to 0x00. +/// Subtracts, with saturation, corresponding 8-bit unsigned integer values in +/// the input and returns the differences in the corresponding bytes in the +/// destination. +/// +/// Differences less than 0x00 are saturated to 0x00. /// /// \headerfile /// @@ -2577,9 +2590,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); } -/// Subtracts corresponding 16-bit unsigned integer values in the input -/// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 0x0000 are saturated to 0x0000. +/// Subtracts, with saturation, corresponding 16-bit unsigned integer values in +/// the input and returns the differences in the corresponding bytes in the +/// destination. +/// +/// Differences less than 0x0000 are saturated to 0x0000. /// /// \headerfile /// @@ -4050,26 +4065,22 @@ void _mm_mfence(void); } // extern "C" #endif -/// Converts 16-bit signed integers from both 128-bit integer vector -/// operands into 8-bit signed integers, and packs the results into the -/// destination. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. +/// Converts, with saturation, 16-bit signed integers from both 128-bit integer +/// vector operands into 8-bit signed integers, and packs the results into +/// the destination. +/// +/// Positive values greater than 0x7F are saturated to 0x7F. Negative values +/// less than 0x80 are saturated to 0x80. 
/// /// \headerfile /// /// This intrinsic corresponds to the VPACKSSWB / PACKSSWB instruction. /// /// \param __a -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to a 8-bit signed integer with -/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less -/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the lower 64 bits of the result. /// \param __b -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to a 8-bit signed integer with -/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less -/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, @@ -4077,26 +4088,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } -/// Converts 32-bit signed integers from both 128-bit integer vector -/// operands into 16-bit signed integers, and packs the results into the -/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. +/// Converts, with saturation, 32-bit signed integers from both 128-bit integer +/// vector operands into 16-bit signed integers, and packs the results into +/// the destination. +/// +/// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative +/// values less than 0x8000 are saturated to 0x8000. /// /// \headerfile /// /// This intrinsic corresponds to the VPACKSSDW / PACKSSDW instruction. /// /// \param __a -/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as -/// a signed integer and is converted to a 16-bit signed integer with -/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values -/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values +/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values /// are written to the lower 64 bits of the result. /// \param __b -/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as -/// a signed integer and is converted to a 16-bit signed integer with -/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values -/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values +/// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values /// are written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, @@ -4104,26 +4111,22 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } -/// Converts 16-bit signed integers from both 128-bit integer vector -/// operands into 8-bit unsigned integers, and packs the results into the -/// destination. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. 
+/// Converts, with saturation, 16-bit signed integers from both 128-bit integer +/// vector operands into 8-bit unsigned integers, and packs the results into +/// the destination. +/// +/// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00 +/// are saturated to 0x00. /// /// \headerfile /// /// This intrinsic corresponds to the VPACKUSWB / PACKUSWB instruction. /// /// \param __a -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the lower 64 bits of the result. /// \param __b -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are /// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [16 x i8] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, diff --git a/clang/lib/Headers/fmaintrin.h b/clang/lib/Headers/fmaintrin.h index ea832fac4f99226..22d1a780bbfd4eb 100644 --- a/clang/lib/Headers/fmaintrin.h +++ b/clang/lib/Headers/fmaintrin.h @@ -60,7 +60,8 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// Computes a scalar multiply-add of the single-precision values in the /// low 32 bits of 128-bit vectors of [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] /// result[127:32] = __A[127:32] /// \endcode @@ -88,7 +89,8 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// Computes a scalar multiply-add of the double-precision values in the /// low 64 bits of 128-bit vectors of [2 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] /// result[127:64] = __A[127:64] /// \endcode @@ -156,7 +158,8 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// Computes a scalar multiply-subtract of the single-precision values in /// the low 32 bits of 128-bit vectors of [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] /// result[127:32] = __A[127:32] /// \endcode @@ -184,7 +187,8 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// Computes a scalar multiply-subtract of the double-precision values in /// the low 64 bits of 128-bit vectors of [2 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] /// result[127:64] = __A[127:64] /// \endcode @@ -252,7 +256,8 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) /// Computes a scalar negated multiply-add of the single-precision values in /// the low 32 bits of 128-bit vectors of [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0] /// result[127:32] = __A[127:32] /// \endcode @@ -280,7 +285,8 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) /// Computes a scalar negated multiply-add of the double-precision values /// in the low 64 bits of 128-bit vectors of [2 x double]. 
-/// \code +/// +/// \code{.operation} /// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0] /// result[127:64] = __A[127:64] /// \endcode @@ -348,7 +354,8 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) /// Computes a scalar negated multiply-subtract of the single-precision /// values in the low 32 bits of 128-bit vectors of [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0] /// result[127:32] = __A[127:32] /// \endcode @@ -376,7 +383,8 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) /// Computes a scalar negated multiply-subtract of the double-precision /// values in the low 64 bits of 128-bit vectors of [2 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0] /// result[127:64] = __A[127:64] /// \endcode @@ -404,7 +412,8 @@ _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) /// Computes a multiply with alternating add/subtract of 128-bit vectors of /// [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] @@ -430,7 +439,8 @@ _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) /// Computes a multiply with alternating add/subtract of 128-bit vectors of /// [2 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] /// \endcode @@ -454,7 +464,8 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) /// Computes a multiply with alternating add/subtract of 128-bit vectors of /// [4 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] @@ -480,7 +491,8 @@ _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) /// Computes a multiply with alternating add/subtract of 128-bit vectors of /// [2 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] /// \endcode @@ -664,7 +676,8 @@ _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) /// Computes a multiply with alternating add/subtract of 256-bit vectors of /// [8 x float]. -/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0] /// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32] /// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64] @@ -694,7 +707,8 @@ _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) /// Computes a multiply with alternating add/subtract of 256-bit vectors of /// [4 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0] /// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64] /// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128] @@ -720,7 +734,8 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) /// Computes a vector multiply with alternating add/subtract of 256-bit /// vectors of [8 x float]. 
-/// \code +/// +/// \code{.operation} /// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0] /// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32] /// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64] @@ -750,7 +765,8 @@ _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) /// Computes a vector multiply with alternating add/subtract of 256-bit /// vectors of [4 x double]. -/// \code +/// +/// \code{.operation} /// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0] /// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64] /// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128] diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 08849f01071aea7..962d24738e7aa48 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -105,28 +105,23 @@ _mm_cvtm64_si64(__m64 __m) return (long long)__m; } -/// Converts 16-bit signed integers from both 64-bit integer vector -/// parameters of [4 x i16] into 8-bit signed integer values, and constructs -/// a 64-bit integer vector of [8 x i8] as the result. Positive values -/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80 -/// are saturated to 0x80. +/// Converts, with saturation, 16-bit signed integers from both 64-bit integer +/// vector parameters of [4 x i16] into 8-bit signed integer values, and +/// constructs a 64-bit integer vector of [8 x i8] as the result. +/// +/// Positive values greater than 0x7F are saturated to 0x7F. Negative values +/// less than 0x80 are saturated to 0x80. /// /// \headerfile /// /// This intrinsic corresponds to the PACKSSWB instruction. /// /// \param __m1 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit signed integer with -/// saturation. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. The converted -/// [4 x i8] values are written to the lower 32 bits of the result. +/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are +/// written to the lower 32 bits of the result. /// \param __m2 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit signed integer with -/// saturation. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. The converted -/// [4 x i8] values are written to the upper 32 bits of the result. +/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are +/// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. static __inline__ __m64 __DEFAULT_FN_ATTRS @@ -135,28 +130,23 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); } -/// Converts 32-bit signed integers from both 64-bit integer vector -/// parameters of [2 x i32] into 16-bit signed integer values, and constructs -/// a 64-bit integer vector of [4 x i16] as the result. Positive values -/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than -/// 0x8000 are saturated to 0x8000. +/// Converts, with saturation, 32-bit signed integers from both 64-bit integer +/// vector parameters of [2 x i32] into 16-bit signed integer values, and +/// constructs a 64-bit integer vector of [4 x i16] as the result. +/// +/// Positive values greater than 0x7FFF are saturated to 0x7FFF. 
Negative +/// values less than 0x8000 are saturated to 0x8000. /// /// \headerfile /// /// This intrinsic corresponds to the PACKSSDW instruction. /// /// \param __m1 -/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a -/// 32-bit signed integer and is converted to a 16-bit signed integer with -/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. The converted -/// [2 x i16] values are written to the lower 32 bits of the result. +/// A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are +/// written to the lower 32 bits of the result. /// \param __m2 -/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a -/// 32-bit signed integer and is converted to a 16-bit signed integer with -/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. The converted -/// [2 x i16] values are written to the upper 32 bits of the result. +/// A 64-bit integer vector of [2 x i32]. The converted [2 x i16] values are +/// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. static __inline__ __m64 __DEFAULT_FN_ATTRS @@ -165,28 +155,23 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); } -/// Converts 16-bit signed integers from both 64-bit integer vector -/// parameters of [4 x i16] into 8-bit unsigned integer values, and -/// constructs a 64-bit integer vector of [8 x i8] as the result. Values -/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated -/// to 0. +/// Converts, with saturation, 16-bit signed integers from both 64-bit integer +/// vector parameters of [4 x i16] into 8-bit unsigned integer values, and +/// constructs a 64-bit integer vector of [8 x i8] as the result. +/// +/// Values greater than 0xFF are saturated to 0xFF. Values less than 0 are +/// saturated to 0. /// /// \headerfile /// /// This intrinsic corresponds to the PACKUSWB instruction. /// /// \param __m1 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0 are saturated to 0. The converted [4 x i8] values are written to -/// the lower 32 bits of the result. +/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are +/// written to the lower 32 bits of the result. /// \param __m2 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0 are saturated to 0. The converted [4 x i8] values are written to -/// the upper 32 bits of the result. +/// A 64-bit integer vector of [4 x i16]. The converted [4 x i8] values are +/// written to the upper 32 bits of the result. /// \returns A 64-bit integer vector of [8 x i8] containing the converted /// values. 
static __inline__ __m64 __DEFAULT_FN_ATTRS @@ -400,11 +385,13 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); } -/// Adds each 8-bit signed integer element of the first 64-bit integer -/// vector of [8 x i8] to the corresponding 8-bit signed integer element of -/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than -/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to -/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8]. +/// Adds, with saturation, each 8-bit signed integer element of the first +/// 64-bit integer vector of [8 x i8] to the corresponding 8-bit signed +/// integer element of the second 64-bit integer vector of [8 x i8]. +/// +/// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums +/// less than 0x80 are saturated to 0x80. The results are packed into a +/// 64-bit integer vector of [8 x i8]. /// /// \headerfile /// @@ -422,12 +409,13 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); } -/// Adds each 16-bit signed integer element of the first 64-bit integer -/// vector of [4 x i16] to the corresponding 16-bit signed integer element of -/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than -/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are -/// saturated to 0x8000. The results are packed into a 64-bit integer vector -/// of [4 x i16]. +/// Adds, with saturation, each 16-bit signed integer element of the first +/// 64-bit integer vector of [4 x i16] to the corresponding 16-bit signed +/// integer element of the second 64-bit integer vector of [4 x i16]. +/// +/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums +/// less than 0x8000 are saturated to 0x8000. The results are packed into a +/// 64-bit integer vector of [4 x i16]. /// /// \headerfile /// @@ -445,11 +433,12 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); } -/// Adds each 8-bit unsigned integer element of the first 64-bit integer -/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of -/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are -/// saturated to 0xFF. The results are packed into a 64-bit integer vector of -/// [8 x i8]. +/// Adds, with saturation, each 8-bit unsigned integer element of the first +/// 64-bit integer vector of [8 x i8] to the corresponding 8-bit unsigned +/// integer element of the second 64-bit integer vector of [8 x i8]. +/// +/// Sums greater than 0xFF are saturated to 0xFF. The results are packed +/// into a 64-bit integer vector of [8 x i8]. /// /// \headerfile /// @@ -467,11 +456,12 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); } -/// Adds each 16-bit unsigned integer element of the first 64-bit integer -/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element -/// of the second 64-bit integer vector of [4 x i16]. Sums greater than -/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit -/// integer vector of [4 x i16]. +/// Adds, with saturation, each 16-bit unsigned integer element of the first +/// 64-bit integer vector of [4 x i16] to the corresponding 16-bit unsigned +/// integer element of the second 64-bit integer vector of [4 x i16]. +/// +/// Sums greater than 0xFFFF are saturated to 0xFFFF. 
The results are packed +/// into a 64-bit integer vector of [4 x i16]. /// /// \headerfile /// @@ -552,12 +542,13 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); } -/// Subtracts each 8-bit signed integer element of the second 64-bit -/// integer vector of [8 x i8] from the corresponding 8-bit signed integer -/// element of the first 64-bit integer vector of [8 x i8]. Positive results -/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80 -/// are saturated to 0x80. The results are packed into a 64-bit integer -/// vector of [8 x i8]. +/// Subtracts, with saturation, each 8-bit signed integer element of the second +/// 64-bit integer vector of [8 x i8] from the corresponding 8-bit signed +/// integer element of the first 64-bit integer vector of [8 x i8]. +/// +/// Positive results greater than 0x7F are saturated to 0x7F. Negative +/// results less than 0x80 are saturated to 0x80. The results are packed +/// into a 64-bit integer vector of [8 x i8]. /// /// \headerfile /// @@ -575,12 +566,13 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); } -/// Subtracts each 16-bit signed integer element of the second 64-bit -/// integer vector of [4 x i16] from the corresponding 16-bit signed integer -/// element of the first 64-bit integer vector of [4 x i16]. Positive results -/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than -/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit -/// integer vector of [4 x i16]. +/// Subtracts, with saturation, each 16-bit signed integer element of the +/// second 64-bit integer vector of [4 x i16] from the corresponding 16-bit +/// signed integer element of the first 64-bit integer vector of [4 x i16]. +/// +/// Positive results greater than 0x7FFF are saturated to 0x7FFF. Negative +/// results less than 0x8000 are saturated to 0x8000. The results are packed +/// into a 64-bit integer vector of [4 x i16]. /// /// \headerfile /// diff --git a/clang/lib/Headers/prfchwintrin.h b/clang/lib/Headers/prfchwintrin.h index d2f91aa0123e0ec..8a13784543c5f1e 100644 --- a/clang/lib/Headers/prfchwintrin.h +++ b/clang/lib/Headers/prfchwintrin.h @@ -15,9 +15,10 @@ #define __PRFCHWINTRIN_H /// Loads a memory sequence containing the specified memory address into -/// all data cache levels. The cache-coherency state is set to exclusive. -/// Data can be read from and written to the cache line without additional -/// delay. +/// all data cache levels. +/// +/// The cache-coherency state is set to exclusive. Data can be read from +/// and written to the cache line without additional delay. /// /// \headerfile /// @@ -32,10 +33,11 @@ _m_prefetch(void *__P) } /// Loads a memory sequence containing the specified memory address into -/// the L1 data cache and sets the cache-coherency to modified. This -/// provides a hint to the processor that the cache line will be modified. -/// It is intended for use when the cache line will be written to shortly -/// after the prefetch is performed. +/// the L1 data cache and sets the cache-coherency state to modified. +/// +/// This provides a hint to the processor that the cache line will be +/// modified. It is intended for use when the cache line will be written to +/// shortly after the prefetch is performed. /// /// Note that the effect of this intrinsic is dependent on the processor /// implementation. 
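The saturation rules spelled out by these comment rewrites are easy to check empirically. The following stand-alone snippet is a hedged illustration, not part of the patch; it assumes an x86-64 host with SSE2 and exercises the documented `_mm_packus_epi16` behavior:

```c++
// Demo (not from the patch): saturation behavior of _mm_packus_epi16 as
// described in the rewritten documentation above.
#include <emmintrin.h>
#include <cstdio>

int main() {
  // Signed 16-bit inputs: negatives clamp to 0x00, values above 0xFF to 0xFF.
  __m128i a = _mm_setr_epi16(-5, 0, 100, 300, 0x7FFF, -32768, 255, 1);
  __m128i packed = _mm_packus_epi16(a, _mm_setzero_si128());

  unsigned char out[16];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(out), packed);
  for (int i = 0; i < 8; ++i)
    std::printf("%u ", out[i]); // prints: 0 0 100 255 255 0 255 1
  std::printf("\n");
  return 0;
}
```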
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 005d7db9c3c3087..c52ffb77e33d5c4 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -1431,8 +1431,10 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { } /* SSE4 Pack with Unsigned Saturation. */ -/// Converts 32-bit signed integers from both 128-bit integer vector -/// operands into 16-bit unsigned integers, and returns the packed result. +/// Converts, with saturation, 32-bit signed integers from both 128-bit integer +/// vector operands into 16-bit unsigned integers, and returns the packed +/// result. +/// /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than /// 0x0000 are saturated to 0x0000. /// @@ -1441,17 +1443,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) { /// This intrinsic corresponds to the VPACKUSDW / PACKUSDW instruction. /// /// \param __V1 -/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a -/// signed integer and is converted to a 16-bit unsigned integer with -/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values -/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values -/// are written to the lower 64 bits of the result. +/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are +/// written to the lower 64 bits of the result. /// \param __V2 -/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a -/// signed integer and is converted to a 16-bit unsigned integer with -/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values -/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values -/// are written to the higher 64 bits of the result. +/// A 128-bit vector of [4 x i32]. The converted [4 x i16] values are +/// written to the higher 64 bits of the result. /// \returns A 128-bit vector of [8 x i16] containing the converted values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi32(__m128i __V1, __m128i __V2) { diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 7d8dc46c57bfeb2..bf8327b692d1cfb 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -271,10 +271,11 @@ _mm_hadd_pi32(__m64 __a, __m64 __b) return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); } -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are -/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. +/// Horizontally adds, with saturation, the adjacent pairs of values contained +/// in two packed 128-bit vectors of [8 x i16]. +/// +/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums +/// less than 0x8000 are saturated to 0x8000. /// /// \headerfile /// @@ -296,10 +297,11 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are -/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. +/// Horizontally adds, with saturation, the adjacent pairs of values contained +/// in two packed 64-bit vectors of [4 x i16]. +/// +/// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums +/// less than 0x8000 are saturated to 0x8000. 
/// /// \headerfile /// @@ -413,10 +415,11 @@ _mm_hsub_pi32(__m64 __a, __m64 __b) return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); } -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 128-bit vectors of [8 x i16]. Positive differences greater than -/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are -/// saturated to 0x8000. +/// Horizontally subtracts, with saturation, the adjacent pairs of values +/// contained in two packed 128-bit vectors of [8 x i16]. +/// +/// Positive differences greater than 0x7FFF are saturated to 0x7FFF. +/// Negative differences less than 0x8000 are saturated to 0x8000. /// /// \headerfile /// @@ -438,10 +441,11 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 64-bit vectors of [4 x i16]. Positive differences greater than -/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are -/// saturated to 0x8000. +/// Horizontally subtracts, with saturation, the adjacent pairs of values +/// contained in two packed 64-bit vectors of [4 x i16]. +/// +/// Positive differences greater than 0x7FFF are saturated to 0x7FFF. +/// Negative differences less than 0x8000 are saturated to 0x8000. /// /// \headerfile ///
From 9da9b5f86750fc5286a89cc75f5f731affec7dfa Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 28 Feb 2024 09:43:44 -0800 Subject: [PATCH 098/114] [libc][docs] Document policy for non-standard func (#83212)
As encountered with , we need a policy for how to handle implementing functions that users need, but have no specific standard. In that case, we should treat existing implementations as the standard and try to match their behavior as closely as possible.
--- libc/docs/dev/undefined_behavior.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libc/docs/dev/undefined_behavior.rst b/libc/docs/dev/undefined_behavior.rst index 0cb25c7f2a23366..6e73a305e8e054b 100644 --- a/libc/docs/dev/undefined_behavior.rst +++ b/libc/docs/dev/undefined_behavior.rst @@ -20,6 +20,7 @@ guidelines and the resulting code should behave predictably even in unexpected situations. #. Follow the standards. + #. If there is no standard, first ask yourself if this implementation is necessary (are there users who need this functionality?). If it truly is, then match existing implementations. Creating competing designs just causes confusion (see the history of qsort_r). #. Avoid giving an incorrect answer. #. In general, correct answer > correct answer (wrong format) > no answer > crash the program >>>>>>> incorrect answer. #. The C library is called frequently in performance critical situations, and so can't afford to do thorough error checking and correction. @@ -61,7 +62,7 @@ Often the standard will imply an intended behavior through what it states is und Ignoring Bug-For-Bug Compatibility ---------------------------------- -Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations.
There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library. +Any long running implementations will have bugs and deviations from the standard. Hyrum's Law states that “all observable behaviors of your system will be depended on by somebody” which includes these bugs. An example of a long-standing bug is glibc's scanf float parsing behavior. The behavior is specifically defined in the standard, but it isn't adhered to by all libc implementations. There is a longstanding bug in glibc where it incorrectly parses the string 100er and this caused the C standard to add that specific example to the definition for scanf. The intended behavior is for scanf, when parsing a float, to parse the longest possibly valid prefix and then accept it if and only if that complete parsed value is a float. In the case of 100er the longest possibly valid prefix is 100e but the float parsed from that string is only 100. Since there is no number after the e it shouldn't be included in the float, so scanf should return a parsing error. For LLVM's libc it was decided to follow the standard, even though glibc's version is slightly simpler to implement and this edge case is rare. Following the standard must be the first priority, since that's the goal of the library. If there is no standard, then matching another implementation (even bug-for-bug) may be necessary, but before you implement an unstandardized function first consider if anyone will actually use it at all.
Design Decisions ================
From 46bd65a0509fefdf50d44f63640a7b8bd8ad0d14 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Wed, 28 Feb 2024 09:45:09 -0800 Subject: [PATCH 099/114] [mlir][LinAlg] Vectorize reverse-like ops using vector.gather ops. (#83205)
The reverse op is treated as a VectorMemoryAccessKind::Contiguous load. It is a contiguous slice, but we would need to compute the indices differently and apply a reverse at the vector level, which takes non-trivial effort. The revision flips the case to use vector.gather instead; otherwise there are correctness issues. E.g., the example below loaded `2, 3, 4` (which is a bug), when what we want is `2, 1, 0`.
Before vectorization: ```mlir func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> { %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index %c2 = arith.constant 2 : index %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) { ^bb0(%out: f32): %1 = linalg.index 1 : index %2 = linalg.index 0 : index %3 = affine.apply #map1(%1, %2, %arg2) %4 = linalg.index 2 : index %5 = arith.subi %c2, %4 : index %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32> linalg.yield %extracted : f32 } -> tensor<1x1x3xf32> return %0 : tensor<1x1x3xf32> } ``` Partial IR after vectorization: ``` %5 = vector.constant_mask [1, 1, 3] : vector<1x1x4xi1> %6 = vector.broadcast %arg0 : index to vector<1x1x4xindex> %7 = vector.shape_cast %6 : vector<1x1x4xindex> to vector<4xindex> %8 = vector.extractelement %7[%c0_i32 : i32] : vector<4xindex> %9 = vector.transfer_read %3[%c0, %8, %c2], %cst, %5 {in_bounds = [true, true, true]} : tensor<1x2x3xf32>, vector<1x1x4xf32> ``` --- .../Linalg/Transforms/Vectorization.cpp | 3 +- .../Linalg/vectorize-tensor-extract.mlir | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index ac043e87223dfe3..1e703dacfd0c758 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -891,8 +891,7 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, // Conservatively reject Ops that could lead to indices with stride other // than 1. - if (!isa( - ancestor)) + if (!isa(ancestor)) return false; bool result = false; diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir index 96953c234a0873b..85e1c56dd45a0dc 100644 --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -550,3 +550,48 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0 + d1 + d2)> +func.func @vectorize_reverse_like_tensor_extract(%arg0: tensor<1x2x3xf32>, %arg1: tensor<1x1x3xf32>, %arg2: index) -> tensor<1x1x3xf32> { + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %0 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel", "parallel", "parallel"]} outs(%arg1 : tensor<1x1x3xf32>) { + ^bb0(%out: f32): + %1 = linalg.index 1 : index + %2 = linalg.index 0 : index + %3 = affine.apply #map1(%1, %2, %arg2) + %4 = linalg.index 2 : index + %5 = arith.subi %c2, %4 : index + %extracted = tensor.extract %arg0[%c0, %3, %5] : tensor<1x2x3xf32> + linalg.yield %extracted : f32 + } -> tensor<1x1x3xf32> + return %0 : tensor<1x1x3xf32> +} +// CHECK-LABEL: func.func @vectorize_reverse_like_tensor_extract +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]] +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]] +// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]] +// CHECK-DAG: %[[CST:.+]] = arith.constant dense<3> : vector<1x1x3xindex> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[MASK:.*]] = arith.constant dense : vector<1x1x3xi1> +// CHECK-DAG: %[[PASSTHRU:.*]] = arith.constant dense<0.000000e+00> : vector<1x1x3xf32> +// CHECK-DAG: 
%[[INIT_IDX:.+]] = arith.constant dense<[2, 1, 0]> : vector<3xindex> +// CHECK: %[[T0:.+]] = vector.broadcast %[[ARG2]] : index to vector<1x1x3xindex> +// CHECK: %[[T1:.+]] = arith.muli %[[T0]], %[[CST]] : vector<1x1x3xindex> +// CHECK: %[[T2:.+]] = vector.broadcast %[[INIT_IDX]] +// CHECK: %[[T3:.+]] = arith.addi %[[T2]], %[[T1]] +// CHECK: %[[GATHER:.*]] = vector.gather %[[ARG0]][%[[C0]], %[[C0]], %[[C0]]] [%[[T3]]], %[[MASK]], %[[PASSTHRU]] +// CHECK: vector.transfer_write %[[GATHER]] + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_nd_extract } : (!transform.any_op) -> !transform.any_op + transform.yield + } +}
From ad43ea3328dad844c245caf93509c2facba1ec32 Mon Sep 17 00:00:00 2001 From: Jason Eckhardt Date: Wed, 28 Feb 2024 11:47:18 -0600 Subject: [PATCH 100/114] [TableGen] Add support for DefaultMode in per-HwMode encode/decode. (#83029)
Currently the decoder and encoder emitters will crash if DefaultMode is used within an EncodingByHwMode. As can be done today for RegInfoByHwMode and ValueTypeByHwMode, this patch adds support for this usage in EncodingByHwMode: let EncodingInfos = EncodingByHwMode<[ModeA, DefaultMode], [EncA, EncDefault]>;
--- llvm/test/TableGen/HwModeEncodeDecode3.td | 168 ++++++++++++++++++++++ llvm/utils/TableGen/CodeEmitterGen.cpp | 8 +- llvm/utils/TableGen/CodeGenHwModes.h | 5 + llvm/utils/TableGen/DecoderEmitter.cpp | 10 +- 4 files changed, 183 insertions(+), 8 deletions(-) create mode 100644 llvm/test/TableGen/HwModeEncodeDecode3.td
diff --git a/llvm/test/TableGen/HwModeEncodeDecode3.td b/llvm/test/TableGen/HwModeEncodeDecode3.td new file mode 100644 index 000000000000000..406e52d25be7065 --- /dev/null +++ b/llvm/test/TableGen/HwModeEncodeDecode3.td @@ -0,0 +1,168 @@ +// RUN: llvm-tblgen -gen-emitter -I %p/../../include %s | \ +// RUN: FileCheck %s --check-prefix=ENCODER +// RUN: llvm-tblgen -gen-disassembler -I %p/../../include %s | \ +// RUN: FileCheck %s --check-prefix=DECODER +// RUN: llvm-tblgen -gen-disassembler --suppress-per-hwmode-duplicates -I \ +// RUN: %p/../../include %s | FileCheck %s --check-prefix=DECODER-SUPPRESS + +include "llvm/Target/Target.td" + +def archInstrInfo : InstrInfo { } + +def arch : Target { + let InstructionSet = archInstrInfo; +} + +def Myi32 : Operand<i32> { + let DecoderMethod = "DecodeMyi32"; +} + +def HasA : Predicate<"Subtarget->hasA()">; +def HasB : Predicate<"Subtarget->hasB()">; + +def ModeA : HwMode<"+a", [HasA]>; +def ModeB : HwMode<"+b", [HasB]>; + + +def fooTypeEncDefault : InstructionEncoding { + let Size = 8; + field bits<64> SoftFail = 0; + bits<64> Inst; + bits<8> factor; + let Inst{7...0} = factor; + let Inst{3...2} = 0b10; + let Inst{1...0} = 0b00; +} + +def fooTypeEncA : InstructionEncoding { + let Size = 4; + field bits<32> SoftFail = 0; + bits<32> Inst; + bits<8> factor; + let Inst{7...0} = factor; + let Inst{3...2} = 0b11; + let Inst{1...0} = 0b00; +} + +def fooTypeEncB : InstructionEncoding { + let Size = 4; + field bits<32> SoftFail = 0; + bits<32> Inst; + bits<8> factor; + let Inst{15...8} = factor; + let Inst{1...0} = 0b11; +} + +// Test for DefaultMode as a selector.
+def foo : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$factor); + let EncodingInfos = EncodingByHwMode< + [ModeA, ModeB, DefaultMode], [fooTypeEncA, fooTypeEncB, fooTypeEncDefault] + >; + let AsmString = "foo $factor"; +} + +def bar: Instruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$factor); + let Size = 4; + bits<32> Inst; + bits<32> SoftFail; + bits<8> factor; + let Inst{31...24} = factor; + let Inst{1...0} = 0b10; + let AsmString = "bar $factor"; +} + +def baz : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$factor); + bits<32> Inst; + let EncodingInfos = EncodingByHwMode< + [ModeB], [fooTypeEncA] + >; + let AsmString = "foo $factor"; +} + +def unrelated: Instruction { + let OutOperandList = (outs); + let DecoderNamespace = "Alt"; + let InOperandList = (ins i32imm:$factor); + let Size = 4; + bits<32> Inst; + bits<32> SoftFail; + bits<8> factor; + let Inst{31...24} = factor; + let Inst{1...0} = 0b10; + let AsmString = "unrelated $factor"; +} + + +// DECODER-LABEL: DecoderTableAlt_DefaultMode32[] = +// DECODER-DAG: Opcode: unrelated +// DECODER-LABEL: DecoderTableAlt_ModeA32[] = +// DECODER-DAG: Opcode: unrelated +// DECODER-LABEL: DecoderTableAlt_ModeB32[] = +// DECODER-DAG: Opcode: unrelated +// DECODER-LABEL: DecoderTable_DefaultMode32[] = +// DECODER-DAG: Opcode: bar +// DECODER-LABEL: DecoderTable_DefaultMode64[] = +// DECODER-DAG: Opcode: fooTypeEncDefault:foo +// DECODER-LABEL: DecoderTable_ModeA32[] = +// DECODER-DAG: Opcode: fooTypeEncA:foo +// DECODER-DAG: Opcode: bar +// DECODER-LABEL: DecoderTable_ModeB32[] = +// DECODER-DAG: Opcode: fooTypeEncB:foo +// DECODER-DAG: Opcode: fooTypeEncA:baz +// DECODER-DAG: Opcode: bar + + +// DECODER-SUPPRESS-LABEL: DecoderTableAlt_AllModes32[] = +// DECODER-SUPPRESS-DAG: Opcode: unrelated +// DECODER-SUPPRESS-LABEL: DecoderTable_AllModes32[] = +// DECODER-SUPPRESS-DAG: Opcode: bar +// DECODER-SUPPRESS-LABEL: DecoderTable_DefaultMode64[] = +// DECODER-SUPPRESS-NOT: Opcode: bar +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncDefault:foo +// DECODER-SUPPRESS-LABEL: DecoderTable_ModeA32[] = +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:foo +// DECODER-SUPPRESS-NOT: Opcode: bar +// DECODER-SUPPRESS-LABEL: DecoderTable_ModeB32[] = +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncB:foo +// DECODER-SUPPRESS-DAG: Opcode: fooTypeEncA:baz +// DECODER-SUPPRESS-NOT: Opcode: bar + +// ENCODER-LABEL: static const uint64_t InstBits_DefaultMode[] = { +// ENCODER: UINT64_C(2), // bar +// ENCODER: UINT64_C(0), // baz +// ENCODER: UINT64_C(8), // foo +// ENCODER: UINT64_C(2), // unrelated + +// ENCODER-LABEL: static const uint64_t InstBits_ModeA[] = { +// ENCODER: UINT64_C(2), // bar +// ENCODER: UINT64_C(0), // baz +// ENCODER: UINT64_C(12), // foo +// ENCODER: UINT64_C(2), // unrelated + +// ENCODER-LABEL: static const uint64_t InstBits_ModeB[] = { +// ENCODER: UINT64_C(2), // bar +// ENCODER: UINT64_C(12), // baz +// ENCODER: UINT64_C(3), // foo +// ENCODER: UINT64_C(2), // unrelated + +// ENCODER: unsigned HwMode = STI.getHwMode(); +// ENCODER: switch (HwMode) { +// ENCODER: default: llvm_unreachable("Unknown hardware mode!"); break; +// ENCODER: case 0: InstBits = InstBits_DefaultMode; break; +// ENCODER: case 1: InstBits = InstBits_ModeA; break; +// ENCODER: case 2: InstBits = InstBits_ModeB; break; +// ENCODER: }; + +// ENCODER: case ::foo: { +// ENCODER: switch (HwMode) { +// ENCODER: default: llvm_unreachable("Unhandled HwMode"); +// ENCODER: case 0: { +// ENCODER: 
case 1: { +// ENCODER: case 2: {
diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index d80761d5fe35d2e..1e80eb6b1ad50ee 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -365,8 +365,8 @@ void CodeEmitterGen::emitInstructionBaseValues( if (HwMode == -1) o << " static const uint64_t InstBits[] = {\n"; else - o << " static const uint64_t InstBits_" << HWM.getMode(HwMode).Name - << "[] = {\n"; + o << " static const uint64_t InstBits_" + << HWM.getModeName(HwMode, /*IncludeDefault=*/true) << "[] = {\n"; for (const CodeGenInstruction *CGI : NumberedInstructions) { Record *R = CGI->TheDef; @@ -495,8 +495,8 @@ void CodeEmitterGen::run(raw_ostream &o) { o << " switch (HwMode) {\n"; o << " default: llvm_unreachable(\"Unknown hardware mode!\"); break;\n"; for (unsigned I : HwModes) { - o << " case " << I << ": InstBits = InstBits_" << HWM.getMode(I).Name - << "; break;\n"; + o << " case " << I << ": InstBits = InstBits_" + << HWM.getModeName(I, /*IncludeDefault=*/true) << "; break;\n"; } o << " };\n"; }
diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 56639f741ede11a..23723b7bd4af552 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -52,6 +52,11 @@ struct CodeGenHwModes { assert(Id != 0 && "Mode id of 0 is reserved for the default mode"); return Modes[Id - 1]; } + StringRef getModeName(unsigned Id, bool IncludeDefault = false) const { + if (IncludeDefault && Id == CodeGenHwModes::DefaultMode) + return DefaultModeName; + return getMode(Id).Name; + } const HwModeSelect &getHwModeSelect(Record *R) const; const std::map<Record *, HwModeSelect> &getHwModeSelects() const { return ModeSelects;
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 4ce5a73d7756688..27ff84bce4058e1 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -2461,8 +2461,9 @@ collectHwModesReferencedForEncodings(const CodeGenHwModes &HWM, BV.set(P.first); } } - transform(BV.set_bits(), std::back_inserter(Names), - [&HWM](const int &M) { return HWM.getMode(M).Name; }); + transform(BV.set_bits(), std::back_inserter(Names), [&HWM](const int &M) { + return HWM.getModeName(M, /*IncludeDefault=*/true); + }); } // Emits disassembler code for instruction decoding. @@ -2503,8 +2504,9 @@ void DecoderEmitter::run(raw_ostream &o) { if (DefInit *DI = dyn_cast_or_null<DefInit>(RV->getValue())) { EncodingInfoByHwMode EBM(DI->getDef(), HWM); for (auto &KV : EBM) - NumberedEncodings.emplace_back(KV.second, NumberedInstruction, - HWM.getMode(KV.first).Name); + NumberedEncodings.emplace_back( + KV.second, NumberedInstruction, + HWM.getModeName(KV.first, /*IncludeDefault=*/true)); continue; } }
From c6cbf81c84b00ab0107a09f5b2d0183458a367b8 Mon Sep 17 00:00:00 2001 From: Cyndy Ishida Date: Wed, 28 Feb 2024 09:47:45 -0800 Subject: [PATCH 101/114] [InstallAPI] Hookup Input files & basic ASTVisitor (#82552)
This patch takes in JSON files as input to determine the header files to process, and in which order, to pass along for CC1 invocations. This patch also includes an ASTVisitor to collect simple global variables.
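For reference, the JSON file lists this tool consumes follow the same shape as the `vars_inputs.json.in` fixture added later in this patch; a minimal hand-written sketch (the paths and header names here are hypothetical) looks like:

```json
{
  "headers": [
    { "path": "/tmp/MyFramework/Headers/MyFramework.h", "type": "public" },
    { "path": "/tmp/MyFramework/PrivateHeaders/Secret.h", "type": "private" }
  ],
  "version": "3"
}
```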
--- clang/include/clang/InstallAPI/Context.h | 22 ++++- clang/include/clang/InstallAPI/Frontend.h | 48 ++++++++++ clang/include/clang/InstallAPI/HeaderFile.h | 23 +++++ clang/include/clang/InstallAPI/Visitor.h | 51 ++++++++++ clang/lib/InstallAPI/CMakeLists.txt | 3 + clang/lib/InstallAPI/Frontend.cpp | 58 ++++++++++++ clang/lib/InstallAPI/Visitor.cpp | 94 +++++++++++++++++++ clang/test/InstallAPI/basic.test | 5 + clang/test/InstallAPI/variables.test | 63 +++++++++++++ .../clang-installapi/ClangInstallAPI.cpp | 76 +++++++++++++-- clang/tools/clang-installapi/Options.cpp | 29 ++++++ clang/tools/clang-installapi/Options.h | 8 ++ 12 files changed, 472 insertions(+), 8 deletions(-) create mode 100644 clang/include/clang/InstallAPI/Frontend.h create mode 100644 clang/include/clang/InstallAPI/Visitor.h create mode 100644 clang/lib/InstallAPI/Frontend.cpp create mode 100644 clang/lib/InstallAPI/Visitor.cpp create mode 100644 clang/test/InstallAPI/variables.test
diff --git a/clang/include/clang/InstallAPI/Context.h b/clang/include/clang/InstallAPI/Context.h index 7d105920734fdec..3e2046642c7fe89 100644 --- a/clang/include/clang/InstallAPI/Context.h +++ b/clang/include/clang/InstallAPI/Context.h @@ -9,6 +9,9 @@ #ifndef LLVM_CLANG_INSTALLAPI_CONTEXT_H #define LLVM_CLANG_INSTALLAPI_CONTEXT_H +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/FileManager.h" +#include "clang/InstallAPI/HeaderFile.h" #include "llvm/TextAPI/InterfaceFile.h" #include "llvm/TextAPI/RecordVisitor.h" #include "llvm/TextAPI/RecordsSlice.h" @@ -24,8 +27,23 @@ struct InstallAPIContext { /// Library attributes that are typically passed as linker inputs. llvm::MachO::RecordsSlice::BinaryAttrs BA; - /// Active target triple to parse. - llvm::Triple TargetTriple{}; + /// All headers that represent a library. + HeaderSeq InputHeaders; + + /// Active language mode to parse in. + Language LangMode = Language::ObjC; + + /// Active header access type. + HeaderType Type = HeaderType::Unknown; + + /// Active TargetSlice for symbol record collection. + std::shared_ptr<llvm::MachO::RecordsSlice> Slice; + + /// FileManager for all I/O operations. + FileManager *FM = nullptr; + + /// DiagnosticsEngine for all error reporting. + DiagnosticsEngine *Diags = nullptr; /// File Path of output location. llvm::StringRef OutputLoc{};
diff --git a/clang/include/clang/InstallAPI/Frontend.h b/clang/include/clang/InstallAPI/Frontend.h new file mode 100644 index 000000000000000..7ee87ae028d0794 --- /dev/null +++ b/clang/include/clang/InstallAPI/Frontend.h @@ -0,0 +1,48 @@ +//===- InstallAPI/Frontend.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Top level wrappers for InstallAPI frontend operations. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_INSTALLAPI_FRONTEND_H +#define LLVM_CLANG_INSTALLAPI_FRONTEND_H + +#include "clang/AST/ASTConsumer.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Frontend/FrontendActions.h" +#include "clang/InstallAPI/Context.h" +#include "clang/InstallAPI/Visitor.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/MemoryBuffer.h" + +namespace clang { +namespace installapi { + +/// Create a buffer that contains all headers to scan +/// for global symbols with.
+std::unique_ptr<llvm::MemoryBuffer> +createInputBuffer(const InstallAPIContext &Ctx); + +class InstallAPIAction : public ASTFrontendAction { +public: + explicit InstallAPIAction(llvm::MachO::RecordsSlice &Records) + : Records(Records) {} + + std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI, + StringRef InFile) override { + return std::make_unique<InstallAPIVisitor>(CI.getASTContext(), Records); + } + +private: + llvm::MachO::RecordsSlice &Records; +}; +} // namespace installapi +} // namespace clang + +#endif // LLVM_CLANG_INSTALLAPI_FRONTEND_H
diff --git a/clang/include/clang/InstallAPI/HeaderFile.h b/clang/include/clang/InstallAPI/HeaderFile.h index fc64a43b3def5c7..70e83bbb3e76f6b 100644 --- a/clang/include/clang/InstallAPI/HeaderFile.h +++ b/clang/include/clang/InstallAPI/HeaderFile.h @@ -15,6 +15,7 @@ #include "clang/Basic/LangStandard.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Regex.h" #include #include @@ -32,6 +33,20 @@ enum class HeaderType { Project, }; +inline StringRef getName(const HeaderType T) { + switch (T) { + case HeaderType::Public: + return "Public"; + case HeaderType::Private: + return "Private"; + case HeaderType::Project: + return "Project"; + case HeaderType::Unknown: + return "Unknown"; + } + llvm_unreachable("unexpected header type"); +} + class HeaderFile { /// Full input path to header. std::string FullPath; @@ -52,6 +67,14 @@ class HeaderFile { static llvm::Regex getFrameworkIncludeRule(); + HeaderType getType() const { return Type; } + StringRef getIncludeName() const { return IncludeName; } + StringRef getPath() const { return FullPath; } + + bool useIncludeName() const { + return Type != HeaderType::Project && !IncludeName.empty(); + } + bool operator==(const HeaderFile &Other) const { return std::tie(Type, FullPath, IncludeName, Language) == std::tie(Other.Type, Other.FullPath, Other.IncludeName,
diff --git a/clang/include/clang/InstallAPI/Visitor.h b/clang/include/clang/InstallAPI/Visitor.h new file mode 100644 index 000000000000000..95d669688e4f9cc --- /dev/null +++ b/clang/include/clang/InstallAPI/Visitor.h @@ -0,0 +1,51 @@ +//===- InstallAPI/Visitor.h -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// ASTVisitor Interface for InstallAPI frontend operations. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_INSTALLAPI_VISITOR_H +#define LLVM_CLANG_INSTALLAPI_VISITOR_H + +#include "clang/AST/Mangle.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Frontend/FrontendActions.h" +#include "llvm/ADT/Twine.h" +#include "llvm/TextAPI/RecordsSlice.h" + +namespace clang { +namespace installapi { + +/// ASTVisitor for collecting declarations that represent global symbols. +class InstallAPIVisitor final : public ASTConsumer, + public RecursiveASTVisitor<InstallAPIVisitor> { +public: + InstallAPIVisitor(ASTContext &ASTCtx, llvm::MachO::RecordsSlice &Slice) + : Slice(Slice), + MC(ItaniumMangleContext::create(ASTCtx, ASTCtx.getDiagnostics())), + Layout(ASTCtx.getTargetInfo().getDataLayoutString()) {} + void HandleTranslationUnit(ASTContext &ASTCtx) override; + + /// Collect global variables.
+ bool VisitVarDecl(const VarDecl *D); + +private: + std::string getMangledName(const NamedDecl *D) const; + std::string getBackendMangledName(llvm::Twine Name) const; + + llvm::MachO::RecordsSlice &Slice; + std::unique_ptr<clang::ItaniumMangleContext> MC; + StringRef Layout; +}; + +} // namespace installapi +} // namespace clang + +#endif // LLVM_CLANG_INSTALLAPI_VISITOR_H
diff --git a/clang/lib/InstallAPI/CMakeLists.txt b/clang/lib/InstallAPI/CMakeLists.txt index fdc4f064f29e95f..19fc4c3abde53af 100644 --- a/clang/lib/InstallAPI/CMakeLists.txt +++ b/clang/lib/InstallAPI/CMakeLists.txt @@ -1,11 +1,14 @@ set(LLVM_LINK_COMPONENTS Support TextAPI + Core ) add_clang_library(clangInstallAPI FileList.cpp + Frontend.cpp HeaderFile.cpp + Visitor.cpp LINK_LIBS clangAST
diff --git a/clang/lib/InstallAPI/Frontend.cpp b/clang/lib/InstallAPI/Frontend.cpp new file mode 100644 index 000000000000000..9f675ef7d1bd222 --- /dev/null +++ b/clang/lib/InstallAPI/Frontend.cpp @@ -0,0 +1,58 @@ +//===- Frontend.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/InstallAPI/Frontend.h" +#include "clang/AST/Availability.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" + +using namespace llvm; +using namespace llvm::MachO; + +namespace clang::installapi { + +static StringRef getFileExtension(clang::Language Lang) { + switch (Lang) { + default: + llvm_unreachable("Unexpected language option."); + case clang::Language::C: + return ".c"; + case clang::Language::CXX: + return ".cpp"; + case clang::Language::ObjC: + return ".m"; + case clang::Language::ObjCXX: + return ".mm"; + } +} + +std::unique_ptr<MemoryBuffer> createInputBuffer(const InstallAPIContext &Ctx) { + assert(Ctx.Type != HeaderType::Unknown && + "unexpected access level for parsing"); + SmallString<4096> Contents; + raw_svector_ostream OS(Contents); + for (const HeaderFile &H : Ctx.InputHeaders) { + if (H.getType() != Ctx.Type) + continue; + if (Ctx.LangMode == Language::C || Ctx.LangMode == Language::CXX) + OS << "#include "; + else + OS << "#import "; + if (H.useIncludeName()) + OS << "<" << H.getIncludeName() << ">"; + else + OS << "\"" << H.getPath() << "\""; + } + if (Contents.empty()) + return nullptr; + + return llvm::MemoryBuffer::getMemBufferCopy( + Contents, "installapi-includes" + getFileExtension(Ctx.LangMode)); +} + +} // namespace clang::installapi
diff --git a/clang/lib/InstallAPI/Visitor.cpp b/clang/lib/InstallAPI/Visitor.cpp new file mode 100644 index 000000000000000..9b333a6142ae8f9 --- /dev/null +++ b/clang/lib/InstallAPI/Visitor.cpp @@ -0,0 +1,94 @@ +//===- Visitor.cpp ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/InstallAPI/Visitor.h" +#include "clang/AST/Availability.h" +#include "clang/Basic/Linkage.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Mangler.h" + +using namespace llvm; +using namespace llvm::MachO; + +namespace clang::installapi { + +// Exported NamedDecl needs to have externally visible linkage and +// default visibility from LinkageComputer. +static bool isExported(const NamedDecl *D) { + auto LV = D->getLinkageAndVisibility(); + return isExternallyVisible(LV.getLinkage()) && + (LV.getVisibility() == DefaultVisibility); +} + +static SymbolFlags getFlags(bool WeakDef, bool ThreadLocal) { + SymbolFlags Result = SymbolFlags::None; + if (WeakDef) + Result |= SymbolFlags::WeakDefined; + if (ThreadLocal) + Result |= SymbolFlags::ThreadLocalValue; + + return Result; +} + +void InstallAPIVisitor::HandleTranslationUnit(ASTContext &ASTCtx) { + if (ASTCtx.getDiagnostics().hasErrorOccurred()) + return; + + auto *D = ASTCtx.getTranslationUnitDecl(); + TraverseDecl(D); +} + +std::string InstallAPIVisitor::getMangledName(const NamedDecl *D) const { + SmallString<256> Name; + if (MC->shouldMangleDeclName(D)) { + raw_svector_ostream NStream(Name); + MC->mangleName(D, NStream); + } else + Name += D->getNameAsString(); + + return getBackendMangledName(Name); +} + +std::string InstallAPIVisitor::getBackendMangledName(Twine Name) const { + SmallString<256> FinalName; + Mangler::getNameWithPrefix(FinalName, Name, DataLayout(Layout)); + return std::string(FinalName); +} + +/// Collect all global variables. +bool InstallAPIVisitor::VisitVarDecl(const VarDecl *D) { + // Skip function parameters. + if (isa<ParmVarDecl>(D)) + return true; + + // Skip variables in records. They are handled separately for C++. + if (D->getDeclContext()->isRecord()) + return true; + + // Skip anything inside functions or methods. + if (!D->isDefinedOutsideFunctionOrMethod()) + return true; + + // If this is a template but not specialization or instantiation, skip. + if (D->getASTContext().getTemplateOrSpecializationInfo(D) && + D->getTemplateSpecializationKind() == TSK_Undeclared) + return true; + + // TODO: Capture SourceLocation & Availability for Decls. + const RecordLinkage Linkage = + isExported(D) ? RecordLinkage::Exported : RecordLinkage::Internal; + const bool WeakDef = D->hasAttr<WeakAttr>(); + const bool ThreadLocal = D->getTLSKind() != VarDecl::TLS_None; + Slice.addGlobal(getMangledName(D), Linkage, GlobalRecord::Kind::Variable, + getFlags(WeakDef, ThreadLocal)); + return true; +} + +} // namespace clang::installapi
diff --git a/clang/test/InstallAPI/basic.test b/clang/test/InstallAPI/basic.test index 22b04792ca2c30e..5b41ccd517b0afe 100644 --- a/clang/test/InstallAPI/basic.test +++ b/clang/test/InstallAPI/basic.test @@ -16,6 +16,11 @@ // CHECK-NOT: warning: //--- basic_inputs.json +{ + "headers": [ + ], + "version": "3" +} //--- expected.tbd {
diff --git a/clang/test/InstallAPI/variables.test b/clang/test/InstallAPI/variables.test new file mode 100644 index 000000000000000..6272867911f18b2 --- /dev/null +++ b/clang/test/InstallAPI/variables.test @@ -0,0 +1,63 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|SRC_DIR|%/t|g" %t/vars_inputs.json.in > %t/vars_inputs.json + +/// Check multiple targets are captured.
+// RUN: clang-installapi -target arm64-apple-macos13.1 -target arm64e-apple-macos13.1 \ +// RUN: -fapplication-extension -install_name /usr/lib/vars.dylib \ +// RUN: %t/vars_inputs.json -o %t/vars.tbd 2>&1 | FileCheck %s --allow-empty +// RUN: llvm-readtapi -compare %t/vars.tbd %t/expected.tbd 2>&1 | FileCheck %s --allow-empty + +// CHECK-NOT: error: +// CHECK-NOT: warning: + +//--- vars.h +extern int foo; + +//--- vars_inputs.json.in +{ + "headers": [ { + "path" : "SRC_DIR/vars.h", + "type" : "public" + }], + "version": "3" +} + +//--- expected.tbd +{ + "main_library": { + "compatibility_versions": [ + { + "version": "0" + }], + "current_versions": [ + { + "version": "0" + }], + "install_names": [ + { + "name": "/usr/lib/vars.dylib" + } + ], + "exported_symbols": [ + { + "data": { + "global": [ + "_foo" + ] + } + } + ], + "target_info": [ + { + "min_deployment": "13.1", + "target": "arm64-macos" + }, + { + "min_deployment": "13.1", + "target": "arm64e-macos" + } + ] + }, + "tapi_tbd_version": 5 +}
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp index fc23ffd7ae6b9cb..43c9fca0a82eec4 100644 --- a/clang/tools/clang-installapi/ClangInstallAPI.cpp +++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp @@ -12,12 +12,14 @@ //===----------------------------------------------------------------------===// #include "Options.h" -#include "clang/Basic/DiagnosticIDs.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticFrontend.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" -#include "clang/Frontend/CompilerInstance.h" +#include "clang/Driver/Tool.h" #include "clang/Frontend/TextDiagnosticPrinter.h" -#include "clang/InstallAPI/Context.h" +#include "clang/InstallAPI/Frontend.h" +#include "clang/Tooling/Tooling.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" @@ -27,7 +29,9 @@ #include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" #include "llvm/TargetParser/Host.h" +#include "llvm/TextAPI/RecordVisitor.h" #include "llvm/TextAPI/TextAPIWriter.h" +#include <memory> using namespace clang; using namespace clang::installapi; @@ -35,6 +39,36 @@ using namespace clang::driver::options; using namespace llvm::opt; using namespace llvm::MachO; +static bool runFrontend(StringRef ProgName, bool Verbose, + const InstallAPIContext &Ctx, + llvm::vfs::InMemoryFileSystem *FS, + const ArrayRef<std::string> InitialArgs) { + + std::unique_ptr<llvm::MemoryBuffer> ProcessedInput = createInputBuffer(Ctx); + // Skip invoking cc1 when there are no header inputs. + if (!ProcessedInput) + return true; + + if (Verbose) + llvm::errs() << getName(Ctx.Type) << " Headers:\n" + << ProcessedInput->getBuffer() << "\n\n"; + + std::string InputFile = ProcessedInput->getBufferIdentifier().str(); + FS->addFile(InputFile, /*ModTime=*/0, std::move(ProcessedInput)); + // Reconstruct arguments with unique values like target triple or input + // headers. + std::vector<std::string> Args = {ProgName.data(), "-target", + Ctx.Slice->getTriple().str().c_str()}; + llvm::copy(InitialArgs, std::back_inserter(Args)); + Args.push_back(InputFile); + + // Create & run invocation. + clang::tooling::ToolInvocation Invocation( + std::move(Args), std::make_unique<InstallAPIAction>(*Ctx.Slice), Ctx.FM); + + return Invocation.run(); +} + static bool run(ArrayRef<const char *> Args, const char *ProgName) { // Setup Diagnostics engine.
IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions(); @@ -48,9 +82,15 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) { new clang::DiagnosticIDs(), DiagOpts.get(), new clang::TextDiagnosticPrinter(llvm::errs(), DiagOpts.get())); - // Create file manager for all file operations. + // Create file manager for all file operations and holding in-memory generated + // inputs. + llvm::IntrusiveRefCntPtr<llvm::vfs::OverlayFileSystem> OverlayFileSystem( + new llvm::vfs::OverlayFileSystem(llvm::vfs::getRealFileSystem())); + llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem( + new llvm::vfs::InMemoryFileSystem); + OverlayFileSystem->pushOverlay(InMemoryFileSystem); IntrusiveRefCntPtr<FileManager> FM( - new FileManager(clang::FileSystemOptions())); + new FileManager(clang::FileSystemOptions(), OverlayFileSystem)); // Set up driver to parse input arguments. auto DriverArgs = llvm::ArrayRef(Args).slice(1); @@ -71,7 +111,10 @@ Options Opts(*Diag, FM.get(), ArgList); if (Diag->hasErrorOccurred()) return EXIT_FAILURE; + InstallAPIContext Ctx = Opts.createContext(); + if (Diag->hasErrorOccurred()) + return EXIT_FAILURE; // Set up compilation. std::unique_ptr<CompilerInstance> CI(new CompilerInstance()); @@ -80,6 +123,21 @@ if (!CI->hasDiagnostics()) return EXIT_FAILURE; + // Execute and gather AST results. + llvm::MachO::Records FrontendResults; + for (const auto &[Targ, Trip] : Opts.DriverOpts.Targets) { + for (const HeaderType Type : + {HeaderType::Public, HeaderType::Private, HeaderType::Project}) { + Ctx.Slice = std::make_shared<RecordsSlice>(Trip); + Ctx.Type = Type; + if (!runFrontend(ProgName, Opts.DriverOpts.Verbose, Ctx, + InMemoryFileSystem.get(), Opts.getClangFrontendArgs())) + return EXIT_FAILURE; + FrontendResults.emplace_back(std::move(Ctx.Slice)); + } + } + + // After symbols have been collected, prepare to write output. auto Out = CI->createOutputFile(Ctx.OutputLoc, /*Binary=*/false, /*RemoveFileOnSignal=*/false, /*UseTemporary=*/false, @@ -88,7 +146,13 @@ return EXIT_FAILURE; // Assign attributes for serialization. - InterfaceFile IF; + auto Symbols = std::make_unique<SymbolSet>(); + for (const auto &FR : FrontendResults) { + SymbolConverter Converter(Symbols.get(), FR->getTarget()); + FR->visit(Converter); + } + + InterfaceFile IF(std::move(Symbols)); for (const auto &TargetInfo : Opts.DriverOpts.Targets) { IF.addTarget(TargetInfo.first); IF.setFromBinaryAttrs(Ctx.BA, TargetInfo.first);
diff --git a/clang/tools/clang-installapi/Options.cpp b/clang/tools/clang-installapi/Options.cpp index 562a643edfcf401..7d45e999448d9f8 100644 --- a/clang/tools/clang-installapi/Options.cpp +++ b/clang/tools/clang-installapi/Options.cpp @@ -9,6 +9,7 @@ #include "Options.h" #include "clang/Driver/Driver.h" #include "clang/Frontend/FrontendDiagnostic.h" +#include "clang/InstallAPI/FileList.h" #include "llvm/Support/Program.h" #include "llvm/TargetParser/Host.h" @@ -68,6 +69,8 @@ bool Options::processDriverOptions(InputArgList &Args) { } } + DriverOpts.Verbose = Args.hasArgNoClaim(OPT_v); + return true; } @@ -104,10 +107,21 @@ Options::Options(DiagnosticsEngine &Diag, FileManager *FM, if (!processLinkerOptions(ArgList)) return; + + /// Any remaining arguments should be handled by invoking the clang frontend.
+  for (const Arg *A : ArgList) {
+    if (A->isClaimed())
+      continue;
+    FrontendArgs.emplace_back(A->getAsString(ArgList));
+  }
+  FrontendArgs.push_back("-fsyntax-only");
 }
 
 InstallAPIContext Options::createContext() {
   InstallAPIContext Ctx;
+  Ctx.FM = FM;
+  Ctx.Diags = Diags;
+
   // InstallAPI requires two level namespacing.
   Ctx.BA.TwoLevelNamespace = true;
 
@@ -116,6 +130,21 @@ InstallAPIContext Options::createContext() {
   Ctx.BA.AppExtensionSafe = LinkerOpts.AppExtensionSafe;
   Ctx.FT = DriverOpts.OutFT;
   Ctx.OutputLoc = DriverOpts.OutputPath;
+
+  // Process inputs.
+  for (const std::string &ListPath : DriverOpts.FileLists) {
+    auto Buffer = FM->getBufferForFile(ListPath);
+    if (auto Err = Buffer.getError()) {
+      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      return Ctx;
+    }
+    if (auto Err = FileListReader::loadHeaders(std::move(Buffer.get()),
+                                               Ctx.InputHeaders)) {
+      Diags->Report(diag::err_cannot_open_file) << ListPath;
+      return Ctx;
+    }
+  }
+
   return Ctx;
 }
diff --git a/clang/tools/clang-installapi/Options.h b/clang/tools/clang-installapi/Options.h
index 4a84166a6c91b1d..f68addf19728809 100644
--- a/clang/tools/clang-installapi/Options.h
+++ b/clang/tools/clang-installapi/Options.h
@@ -43,6 +43,9 @@ struct DriverOptions {
 
   /// \brief File encoding to print.
   llvm::MachO::FileType OutFT = llvm::MachO::FileType::TBD_V5;
+
+  /// \brief Print verbose output.
+  bool Verbose = false;
 };
 
 struct LinkerOptions {
@@ -78,9 +81,14 @@ class Options {
   Options(clang::DiagnosticsEngine &Diag, FileManager *FM,
           llvm::opt::InputArgList &Args);
 
+  /// \brief Get CC1 arguments after extracting out the irrelevant
+  /// ones.
+  std::vector<std::string> &getClangFrontendArgs() { return FrontendArgs; }
+
 private:
   DiagnosticsEngine *Diags;
   FileManager *FM;
+  std::vector<std::string> FrontendArgs;
 };
 
 } // namespace installapi

From b42b7c8a123863d86db9abc8b6a1340b920f6573 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Wed, 28 Feb 2024 17:49:59 +0000
Subject: [PATCH 102/114] [clang] Refactor target attribute mangling. (#81893)

Before this patch, all of the 'target', 'target_version' and
'target_clones' attributes shared common mangling logic across
different targets. However, we would like to differentiate this logic,
so I have moved the default path to ABIInfo and provided overrides for
AArch64. This way we can resolve feature aliases without affecting the
name mangling. PR #80540 demonstrates a motivating case.
---
 clang/lib/CodeGen/ABIInfo.cpp         |  52 ++++++++++++
 clang/lib/CodeGen/ABIInfo.h           |  10 +++
 clang/lib/CodeGen/CodeGenModule.cpp   | 111 ++++----------------------
 clang/lib/CodeGen/Targets/AArch64.cpp |  40 ++++++++++
 4 files changed, 118 insertions(+), 95 deletions(-)

diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
index 1b56cf7c596d067..efcff958ce54522 100644
--- a/clang/lib/CodeGen/ABIInfo.cpp
+++ b/clang/lib/CodeGen/ABIInfo.cpp
@@ -184,6 +184,58 @@ ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty,
                                   /*ByVal*/ false, Realign);
 }
 
+void ABIInfo::appendAttributeMangling(TargetAttr *Attr,
+                                      raw_ostream &Out) const {
+  if (Attr->isDefaultVersion())
+    return;
+  appendAttributeMangling(Attr->getFeaturesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetVersionAttr *Attr,
+                                      raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getNamesStr(), Out);
+}
+
+void ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+                                      raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+  Out << '.'
<< Attr->getMangledIndex(Index); +} + +void ABIInfo::appendAttributeMangling(StringRef AttrStr, + raw_ostream &Out) const { + if (AttrStr == "default") { + Out << ".default"; + return; + } + + Out << '.'; + const TargetInfo &TI = CGT.getTarget(); + ParsedTargetAttr Info = TI.parseTargetAttr(AttrStr); + + llvm::sort(Info.Features, [&TI](StringRef LHS, StringRef RHS) { + // Multiversioning doesn't allow "no-${feature}", so we can + // only have "+" prefixes here. + assert(LHS.starts_with("+") && RHS.starts_with("+") && + "Features should always have a prefix."); + return TI.multiVersionSortPriority(LHS.substr(1)) > + TI.multiVersionSortPriority(RHS.substr(1)); + }); + + bool IsFirst = true; + if (!Info.CPU.empty()) { + IsFirst = false; + Out << "arch_" << Info.CPU; + } + + for (StringRef Feat : Info.Features) { + if (!IsFirst) + Out << '_'; + IsFirst = false; + Out << Feat.substr(1); + } +} + // Pin the vtable to this file. SwiftABIInfo::~SwiftABIInfo() = default; diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h index b9a5ef6e4366936..ff4ae44a42c332c 100644 --- a/clang/lib/CodeGen/ABIInfo.h +++ b/clang/lib/CodeGen/ABIInfo.h @@ -9,6 +9,7 @@ #ifndef LLVM_CLANG_LIB_CODEGEN_ABIINFO_H #define LLVM_CLANG_LIB_CODEGEN_ABIINFO_H +#include "clang/AST/Attr.h" #include "clang/AST/CharUnits.h" #include "clang/AST/Type.h" #include "llvm/IR/CallingConv.h" @@ -111,6 +112,15 @@ class ABIInfo { CodeGen::ABIArgInfo getNaturalAlignIndirectInReg(QualType Ty, bool Realign = false) const; + + virtual void appendAttributeMangling(TargetAttr *Attr, + raw_ostream &Out) const; + virtual void appendAttributeMangling(TargetVersionAttr *Attr, + raw_ostream &Out) const; + virtual void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index, + raw_ostream &Out) const; + virtual void appendAttributeMangling(StringRef AttrStr, + raw_ostream &Out) const; }; /// Target specific hooks for defining how a type should be passed or returned diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index d16d12fac8b03c2..82a97ecfaa00789 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1727,59 +1727,6 @@ static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM, Out << ".resolver"; } -static void AppendTargetVersionMangling(const CodeGenModule &CGM, - const TargetVersionAttr *Attr, - raw_ostream &Out) { - if (Attr->isDefaultVersion()) { - Out << ".default"; - return; - } - Out << "._"; - const TargetInfo &TI = CGM.getTarget(); - llvm::SmallVector Feats; - Attr->getFeatures(Feats); - llvm::stable_sort(Feats, [&TI](const StringRef FeatL, const StringRef FeatR) { - return TI.multiVersionSortPriority(FeatL) < - TI.multiVersionSortPriority(FeatR); - }); - for (const auto &Feat : Feats) { - Out << 'M'; - Out << Feat; - } -} - -static void AppendTargetMangling(const CodeGenModule &CGM, - const TargetAttr *Attr, raw_ostream &Out) { - if (Attr->isDefaultVersion()) - return; - - Out << '.'; - const TargetInfo &Target = CGM.getTarget(); - ParsedTargetAttr Info = Target.parseTargetAttr(Attr->getFeaturesStr()); - llvm::sort(Info.Features, [&Target](StringRef LHS, StringRef RHS) { - // Multiversioning doesn't allow "no-${feature}", so we can - // only have "+" prefixes here. 
- assert(LHS.starts_with("+") && RHS.starts_with("+") && - "Features should always have a prefix."); - return Target.multiVersionSortPriority(LHS.substr(1)) > - Target.multiVersionSortPriority(RHS.substr(1)); - }); - - bool IsFirst = true; - - if (!Info.CPU.empty()) { - IsFirst = false; - Out << "arch_" << Info.CPU; - } - - for (StringRef Feat : Info.Features) { - if (!IsFirst) - Out << '_'; - IsFirst = false; - Out << Feat.substr(1); - } -} - // Returns true if GD is a function decl with internal linkage and // needs a unique suffix after the mangled name. static bool isUniqueInternalLinkageDecl(GlobalDecl GD, @@ -1789,41 +1736,6 @@ static bool isUniqueInternalLinkageDecl(GlobalDecl GD, (CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage); } -static void AppendTargetClonesMangling(const CodeGenModule &CGM, - const TargetClonesAttr *Attr, - unsigned VersionIndex, - raw_ostream &Out) { - const TargetInfo &TI = CGM.getTarget(); - if (TI.getTriple().isAArch64()) { - StringRef FeatureStr = Attr->getFeatureStr(VersionIndex); - if (FeatureStr == "default") { - Out << ".default"; - return; - } - Out << "._"; - SmallVector Features; - FeatureStr.split(Features, "+"); - llvm::stable_sort(Features, - [&TI](const StringRef FeatL, const StringRef FeatR) { - return TI.multiVersionSortPriority(FeatL) < - TI.multiVersionSortPriority(FeatR); - }); - for (auto &Feat : Features) { - Out << 'M'; - Out << Feat; - } - } else { - Out << '.'; - StringRef FeatureStr = Attr->getFeatureStr(VersionIndex); - if (FeatureStr.starts_with("arch=")) - Out << "arch_" << FeatureStr.substr(sizeof("arch=") - 1); - else - Out << FeatureStr; - - Out << '.' << Attr->getMangledIndex(VersionIndex); - } -} - static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, const NamedDecl *ND, bool OmitMultiVersionMangling = false) { @@ -1877,16 +1789,25 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, FD->getAttr(), GD.getMultiVersionIndex(), Out); break; - case MultiVersionKind::Target: - AppendTargetMangling(CGM, FD->getAttr(), Out); + case MultiVersionKind::Target: { + auto *Attr = FD->getAttr(); + const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo(); + Info.appendAttributeMangling(Attr, Out); break; - case MultiVersionKind::TargetVersion: - AppendTargetVersionMangling(CGM, FD->getAttr(), Out); + } + case MultiVersionKind::TargetVersion: { + auto *Attr = FD->getAttr(); + const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo(); + Info.appendAttributeMangling(Attr, Out); break; - case MultiVersionKind::TargetClones: - AppendTargetClonesMangling(CGM, FD->getAttr(), - GD.getMultiVersionIndex(), Out); + } + case MultiVersionKind::TargetClones: { + auto *Attr = FD->getAttr(); + unsigned Index = GD.getMultiVersionIndex(); + const ABIInfo &Info = CGM.getTargetCodeGenInfo().getABIInfo(); + Info.appendAttributeMangling(Attr, Index, Out); break; + } case MultiVersionKind::None: llvm_unreachable("None multiversion type isn't valid here"); } diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 94f8e7be2ee6eb1..adfdd516351901e 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -9,6 +9,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" #include "clang/Basic/DiagnosticFrontend.h" +#include "llvm/TargetParser/AArch64TargetParser.h" using namespace clang; using namespace clang::CodeGen; @@ -75,6 +76,12 @@ class AArch64ABIInfo : public ABIInfo { bool allowBFloatArgsAndRet() const override { 
    return getTarget().hasBFloat16Type();
  }
+
+  using ABIInfo::appendAttributeMangling;
+  void appendAttributeMangling(TargetClonesAttr *Attr, unsigned Index,
+                               raw_ostream &Out) const override;
+  void appendAttributeMangling(StringRef AttrStr,
+                               raw_ostream &Out) const override;
 };
 
 class AArch64SwiftABIInfo : public SwiftABIInfo {
@@ -857,6 +864,39 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABI(
         << Callee->getDeclName();
 }
 
+void AArch64ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr,
+                                             unsigned Index,
+                                             raw_ostream &Out) const {
+  appendAttributeMangling(Attr->getFeatureStr(Index), Out);
+}
+
+void AArch64ABIInfo::appendAttributeMangling(StringRef AttrStr,
+                                             raw_ostream &Out) const {
+  if (AttrStr == "default") {
+    Out << ".default";
+    return;
+  }
+
+  Out << "._";
+  SmallVector<StringRef, 8> Features;
+  AttrStr.split(Features, "+");
+  for (auto &Feat : Features)
+    Feat = Feat.trim();
+
+  // FIXME: It was brought up in #79316 that sorting the features which are
+  // used for mangling based on their multiversion priority is not a good
+  // practice. Changing the feature priorities will break the ABI. Perhaps
+  // it would be preferable to perform a lexicographical sort instead.
+  const TargetInfo &TI = CGT.getTarget();
+  llvm::sort(Features, [&TI](const StringRef LHS, const StringRef RHS) {
+    return TI.multiVersionSortPriority(LHS) < TI.multiVersionSortPriority(RHS);
+  });
+
+  for (auto &Feat : Features)
+    if (auto Ext = llvm::AArch64::parseArchExtension(Feat))
+      Out << 'M' << Ext->Name;
+}
+
 std::unique_ptr<TargetCodeGenInfo>
 CodeGen::createAArch64TargetCodeGenInfo(CodeGenModule &CGM,
                                         AArch64ABIKind Kind) {

From f9b079972c234c90df4b28990d5b2e51184e0b64 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Wed, 28 Feb 2024 17:54:20 +0000
Subject: [PATCH 103/114] [gn build] Port c6cbf81c84b0

---
 llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
index fbff113613d2698..5e533bf23ec4753 100644
--- a/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/InstallAPI/BUILD.gn
@@ -8,6 +8,8 @@ static_library("InstallAPI") {
   ]
   sources = [
     "FileList.cpp",
+    "Frontend.cpp",
     "HeaderFile.cpp",
+    "Visitor.cpp",
   ]
 }

From 6101cf3164d77cfa4e2df75bf5262da9a2fb3d0a Mon Sep 17 00:00:00 2001
From: Cyndy Ishida
Date: Wed, 28 Feb 2024 10:01:03 -0800
Subject: [PATCH 104/114] [InstallAPI] add missing link to clangAST

Appeases bots.
---
 clang/tools/clang-installapi/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/tools/clang-installapi/CMakeLists.txt b/clang/tools/clang-installapi/CMakeLists.txt
index b8384c92c104f65..e05f4eac3ad1983 100644
--- a/clang/tools/clang-installapi/CMakeLists.txt
+++ b/clang/tools/clang-installapi/CMakeLists.txt
@@ -14,6 +14,7 @@ add_clang_tool(clang-installapi
 
 clang_target_link_libraries(clang-installapi
   PRIVATE
+  clangAST
  clangInstallAPI
  clangBasic
  clangDriver

From 1c211bc76e8b6a3261c62e5b6c2b75b7b90386b0 Mon Sep 17 00:00:00 2001
From: Nilanjana Basu
Date: Wed, 28 Feb 2024 10:17:25 -0800
Subject: [PATCH 105/114] [LV] Remove unused configuration option (#82955)

A recent set of changes (PR #67725) to the loop interleaving algorithm
removed the loop trip count threshold for allowing interleaving. The
configuration option interleave-small-loop-scalar-reduction is
therefore no longer needed.
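[Editor's note] Illustrative example, not part of the patch: the kind of loop
the removed flag used to gate is a small, constant-trip-count loop with a
scalar floating-point reduction. Interleaving such a loop keeps several
independent partial sums in flight, exposing ILP. The function and names
below are hypothetical.

    // Hypothetical C++ sketch: a small loop with a scalar reduction.
    // With an interleave count of 2, the vectorizer conceptually forms
    // two independent partial sums (sum0 += a[i]*b[i], sum1 += a[i+1]*b[i+1])
    // and combines them after the loop, which is what exposes the ILP.
    double dot16(const double *a, const double *b) {
      double sum = 0.0;
      for (int i = 0; i < 16; ++i) // small, known trip count
        sum += a[i] * b[i];        // scalar floating-point reduction
      return sum;
    }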
---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp       | 9 +--------
 .../Transforms/LoopVectorize/PowerPC/interleave_IC.ll | 4 ++--
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ea77b6018c0d15f..b81cd508663c321 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -315,12 +315,6 @@ static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
     cl::desc(
         "Enable runtime interleaving until load/store ports are saturated"));
 
-/// Interleave small loops with scalar reductions.
-static cl::opt<bool> InterleaveSmallLoopScalarReduction(
-    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
-    cl::desc("Enable interleaving for loops with small iteration counts that "
-             "contain scalar reductions to expose ILP."));
-
 /// The number of stores in a loop that are allowed to need predication.
 static cl::opt<unsigned> NumberOfStoresToPredicate(
     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
@@ -5495,8 +5489,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
 
   // If there are scalar reductions and TTI has enabled aggressive
   // interleaving for reductions, we will interleave to expose ILP.
-  if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
-      AggressivelyInterleaveReductions) {
+  if (VF.isScalar() && AggressivelyInterleaveReductions) {
     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
     // Interleave no less than SmallIC but not as aggressive as the normal IC
     // to satisfy the rare situation when resources are too limited.
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
index 7121c85dd59be48..c12b3b122ba74c5 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/interleave_IC.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
-; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 -interleave-small-loop-scalar-reduction=true 2>&1 | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -S -mcpu=pwr9 2>&1 | FileCheck %s
+; RUN: opt < %s -passes='loop-vectorize' -S -mcpu=pwr9 2>&1 | FileCheck %s
 
 ; CHECK-LABEL: vector.body
 ; CHECK: load double, ptr

From c1b8c6cf41df4a148e7a89c3a3c7e8049b0a47af Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Wed, 28 Feb 2024 10:18:21 -0800
Subject: [PATCH 106/114] [mlir][vector][print] do not append newline to
 printing pure strings (#83213)

Since vector.print str provides no punctuation control, it is slightly
more flexible to let the client of this operation decide whether there
should be a trailing newline.
This allows for printing like

  vector.print str "nse = "
  vector.print %nse : index

as

  nse = 42
---
 .../VectorToLLVM/ConvertVectorToLLVM.cpp          |  3 ++-
 ...t-wide-int-emulation-compare-results-i16.mlir  |  2 +-
 .../Dialect/Linalg/CPU/ArmSME/fill-2d.mlir        |  2 +-
 .../Linalg/CPU/ArmSME/use-too-many-tiles.mlir     | 10 +++++-----
 .../CPU/ArmSME/Emulated/test-setArmSVLBits.mlir   |  8 ++++----
 .../CPU/ArmSME/load-store-128-bit-tile.mlir       |  8 ++++----
 .../Vector/CPU/ArmSME/test-load-vertical.mlir     |  8 ++++----
 .../CPU/ArmSME/test-multi-tile-transpose.mlir     |  8 ++++----
 .../Vector/CPU/ArmSME/test-outerproduct-f32.mlir  | 16 ++++++++--------
 .../Vector/CPU/ArmSME/test-outerproduct-f64.mlir  | 16 ++++++++--------
 .../Vector/CPU/ArmSME/test-transfer-read-2d.mlir  | 12 ++++++------
 .../CPU/ArmSME/test-transfer-write-2d.mlir        |  2 +-
 .../Vector/CPU/ArmSME/test-transpose.mlir         |  8 ++++----
 .../Dialect/Vector/CPU/ArmSME/tile_fill.mlir      |  4 ++--
 .../Vector/CPU/ArmSME/vector-load-store.mlir      |  8 ++++----
 .../CPU/ArmSVE/arrays-of-scalable-vectors.mlir    | 12 ++++++------
 .../Dialect/Vector/CPU/test-print-str.mlir        |  4 ++--
 17 files changed, 66 insertions(+), 65 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 19cc914efae0059..337f8bb6ab99edb 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1532,7 +1532,8 @@ class VectorPrintOpConversion : public ConvertOpToLLVMPattern<vector::PrintOp> {
     auto punct = printOp.getPunctuation();
     if (auto stringLiteral = printOp.getStringLiteral()) {
       LLVM::createPrintStrCall(rewriter, loc, parent, "vector_print_str",
-                               *stringLiteral, *getTypeConverter());
+                               *stringLiteral, *getTypeConverter(),
+                               /*addNewline=*/false);
     } else if (punct != PrintPunctuation::NoPunctuation) {
       emitCall(rewriter, printOp->getLoc(), [&] {
         switch (punct) {
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
index 15bafeda67403eb..437e49a6b814464 100644
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-wide-int-emulation-compare-results-i16.mlir
@@ -26,7 +26,7 @@ func.func @check_results(%lhs : i16, %rhs : i16, %res0 : i16, %res1 : i16) -> ()
   %mismatch = arith.cmpi ne, %res0, %res1 : i16
   scf.if %mismatch -> () {
     vector.print %res1 : i16
-    vector.print str "Mismatch"
+    vector.print str "Mismatch\n"
   }
   return
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
index 12f13e8dbc4a9f5..881e2799b5b06b0 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir
@@ -88,7 +88,7 @@ func.func @entry() {
   }
 
   // CHECK: SME: END OF TEST OUTPUT
-  vector.print str "SME: END OF TEST OUTPUT"
+  vector.print str "SME: END OF TEST OUTPUT\n"
 
   return
 }
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
index ee3866de303e03b..588b44a36c29f3c 100644
--- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir
@@ -24,23 +24,23 @@ func.func @use_too_many_tiles(%a: memref<?x?xi16>, %b: memref<?x?xi16>, %c: memref<?x?xi16>,
 //
CHECK-LABEL: tile_a: // CHECK-COUNT-8: ( 0, 0, 0, 0, 0, 0, 0, 0 - vector.print str "tile_a:" + vector.print str "tile_a:\n" vector.print %tile_a : vector<[8]x[8]xi16> // CHECK-LABEL: tile_b: // CHECK-COUNT-8: ( 1, 1, 1, 1, 1, 1, 1, 1 - vector.print str "tile_b:" + vector.print str "tile_b:\n" vector.print %tile_b : vector<[8]x[8]xi16> // CHECK-LABEL: tile_c: // CHECK-COUNT-8: ( 2, 2, 2, 2, 2, 2, 2, 2 - vector.print str "tile_c:" + vector.print str "tile_c:\n" vector.print %tile_c : vector<[8]x[8]xi16> // CHECK-LABEL: tile_d: // CHECK-COUNT-8: ( 3, 3, 3, 3, 3, 3, 3, 3 - vector.print str "tile_d:" + vector.print str "tile_d:\n" vector.print %tile_d : vector<[8]x[8]xi16> // CHECK-LABEL: tile_e: // CHECK-COUNT-8: ( 4, 4, 4, 4, 4, 4, 4, 4 - vector.print str "tile_e:" + vector.print str "tile_e:\n" vector.print %tile_e : vector<[8]x[8]xi16> return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir index 415181171e27b85..1794564a6a72445 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir @@ -12,13 +12,13 @@ func.func @checkSVL() { %svl_h = arm_sme.streaming_vl %svl_w = arm_sme.streaming_vl %svl_d = arm_sme.streaming_vl - vector.print str "SVL.b" + vector.print str "SVL.b\n" vector.print %svl_b : index - vector.print str "SVL.h" + vector.print str "SVL.h\n" vector.print %svl_h : index - vector.print str "SVL.w" + vector.print str "SVL.w\n" vector.print %svl_w : index - vector.print str "SVL.d" + vector.print str "SVL.d\n" vector.print %svl_d : index return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir index 2b8899b6c6fc327..41e724844fe4037 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir @@ -53,13 +53,13 @@ func.func @test_load_store_zaq0() { // CHECK-LABEL: INITIAL TILE A: // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 ) - vector.print str "INITIAL TILE A:" + vector.print str "INITIAL TILE A:\n" func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref, index) -> () vector.print punctuation // CHECK-LABEL: INITIAL TILE B: // CHECK: ( 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ) - vector.print str "INITIAL TILE B:" + vector.print str "INITIAL TILE B:\n" func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref, index) -> () vector.print punctuation @@ -68,13 +68,13 @@ func.func @test_load_store_zaq0() { // CHECK-LABEL: FINAL TILE A: // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 ) - vector.print str "FINAL TILE A:" + vector.print str "FINAL TILE A:\n" func.call @print_i8s(%tile_a_bytes, %zaq_size_bytes) : (memref, index) -> () vector.print punctuation // CHECK-LABEL: FINAL TILE B: // CHECK: ( 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 ) - vector.print str "FINAL TILE B:" + vector.print str "FINAL TILE B:\n" func.call @print_i8s(%tile_b_bytes, %zaq_size_bytes) : (memref, index) -> () return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir index 27be801252b8126..68c31ac1dd8e9cf 100644 --- 
a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-load-vertical.mlir @@ -49,12 +49,12 @@ func.func @entry() { // CHECK-NEXT: ( 2, 2, 2, 2 // CHECK-NEXT: ( 3, 3, 3, 3 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" scf.for %i = %c0 to %za_s_size step %svl_s { %tileslice = vector.load %mem1[%i] : memref, vector<[4]xi32> vector.print %tileslice : vector<[4]xi32> } - vector.print str "TILE END" + vector.print str "TILE END\n" // 2. VERTICAL LAYOUT // Dump "mem2". The smallest SVL is 128-bits so the tile will be at least @@ -66,9 +66,9 @@ func.func @entry() { // CHECK-NEXT: ( 0, 1, 2, 3 // CHECK-NEXT: ( 0, 1, 2, 3 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %0 : vector<[4]x[4]xi32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir index 9d836d93c85bb7d..cd48f2a9ebfd895 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-multi-tile-transpose.mlir @@ -46,12 +46,12 @@ func.func @testTransposedReadWithMask(%maskRows: index, %maskCols: index) { vector.transfer_write %readTransposed, %outDyn[%c0, %c0] {in_bounds = [true, true]} : vector<[16]x[4]xf32>, memref /// Print the input memref. - vector.print str "Input memref:" + vector.print str "Input memref:\n" %inUnranked = memref.cast %inDyn : memref to memref<*xf32> call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> () /// Print the result memref. - vector.print str "Masked transposed result:" + vector.print str "Masked transposed result:\n" %outUnranked = memref.cast %outDyn : memref to memref<*xf32> call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> () @@ -84,12 +84,12 @@ func.func @testTransposedWriteWithMask(%maskRows: index, %maskCols: index) { : vector<[16]x[4]xf32>, memref /// Print the input memref. - vector.print str "Input memref:" + vector.print str "Input memref:\n" %inUnranked = memref.cast %inDyn : memref to memref<*xf32> call @printMemrefF32(%inUnranked) : (memref<*xf32>) -> () /// Print the result memref. 
- vector.print str "Masked transposed result:" + vector.print str "Masked transposed result:\n" %outUnranked = memref.cast %outDyn : memref to memref<*xf32> call @printMemrefF32(%outUnranked) : (memref<*xf32>) -> () diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir index 7e7869d1c957aa5..fb6c06cfd69999d 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir @@ -35,9 +35,9 @@ func.func @test_outerproduct_no_accumulator_4x4xf32() { // WITHOUT-ACC-NEXT: ( 0, 2, 4, 6 // WITHOUT-ACC-NEXT: ( 0, 3, 6, 9 // WITHOUT-ACC: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xf32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -60,9 +60,9 @@ func.func @test_outerproduct_with_accumulator_4x4xf32() { // WITH-ACC-NEXT: ( 10, 12, 14, 16 // WITH-ACC-NEXT: ( 10, 13, 16, 19 // WITH-ACC: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xf32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -91,9 +91,9 @@ func.func @test_masked_outerproduct_no_accumulator_4x4xf32() { // WITH-MASK-NEXT: ( 3, 6, 0, 0 // WITH-MASK-NEXT: ( 0, 0, 0, 0 // WITH-MASK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xf32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -124,9 +124,9 @@ func.func @test_masked_outerproduct_with_accumulator_4x4xf32() { // WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10 // WITH-MASK-AND-ACC-NEXT: ( 10, 10, 10, 10 // WITH-MASK-AND-ACC: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xf32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir index 46bf799232ae3a5..b8458606d3f324e 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir @@ -40,9 +40,9 @@ func.func @test_outerproduct_no_accumulator_2x2xf64() { // CHECK-NEXT: ( 1, 2 // CHECK-NEXT: ( 2, 4 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[2]x[2]xf64> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -66,9 +66,9 @@ func.func @test_outerproduct_with_accumulator_2x2xf64() { // WITH-ACC-NEXT: ( 11, 12 // WITH-ACC-NEXT: ( 12, 14 // WITH-ACC: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[2]x[2]xf64> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -96,9 +96,9 @@ func.func @test_masked_outerproduct_no_accumulator_2x2xf64() { // WITH-MASK-NEXT: ( 1, 0 // WITH-MASK-NEXT: ( 2, 0 // WITH-MASK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[2]x[2]xf64> - vector.print str "TILE END" + vector.print str "TILE END\n" return } @@ -127,9 +127,9 @@ func.func @test_masked_outerproduct_with_accumulator_2x2xf64() { // WITH-MASK-AND-ACC-NEXT: ( 11, 12 // WITH-MASK-AND-ACC-NEXT: ( 10, 10 // WITH-MASK-AND-ACC: TILE END - 
vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[2]x[2]xf64> - vector.print str "TILE END" + vector.print str "TILE END\n" return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir index 52f56883cad9c1f..7421521b96bf92c 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-read-2d.mlir @@ -14,7 +14,7 @@ func.func @transfer_read_2d(%A : memref, %base1: index, %base2: index) %0 = vector.transfer_read %A[%base1, %base2], %pad {in_bounds=[true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return @@ -27,7 +27,7 @@ func.func @transfer_read_2d_transposed(%A : memref, %base1: index, %bas {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0 : vector<[4]x[4]xf32> return @@ -42,7 +42,7 @@ func.func @transfer_read_2d_mask(%A : memref, %base1: index, %base2: in %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask {in_bounds = [true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return @@ -58,7 +58,7 @@ func.func @transfer_read_2d_mask_transposed(%A : memref, %base1: index, {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return @@ -73,7 +73,7 @@ func.func @transfer_read_2d_mask_non_zero_pad(%A : memref, %base1: inde %0 = vector.transfer_read %A[%base1, %base2], %pad, %mask {in_bounds = [true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return @@ -89,7 +89,7 @@ func.func @transfer_read_2d_mask_non_zero_pad_transposed(%A : memref, % {permutation_map = affine_map<(d0, d1) -> (d1, d0)>, in_bounds=[true, true]} : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir index 710cc6672f00571..2fef705861f28ed 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir @@ -51,7 +51,7 @@ func.func @transfer_write_2d_mask_transposed(%A : memref, %base1: index func.func @load_and_print(%A : memref, %base1: index, %base2: index) { %0 = vector.load %A[%base1, %base2] : memref, vector<[4]x[4]xf32> - vector.print str "TILE BEGIN:" + vector.print str "TILE BEGIN:\n" vector.print %0: vector<[4]x[4]xf32> return diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir index 88bc0d0709d489a..177c96f1d8aae6a 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transpose.mlir @@ -51,9 +51,9 @@ func.func @entry() { // CHECK-NEXT: ( 2, 2, 2, 2 // 
CHECK-NEXT: ( 3, 3, 3, 3 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xi32> - vector.print str "TILE END" + vector.print str "TILE END\n" // Dump the transposed tile. The smallest SVL is 128-bits so the tile will be // at least 4x4xi32. @@ -64,9 +64,9 @@ func.func @entry() { // CHECK-NEXT: ( 0, 1, 2, 3 // CHECK-NEXT: ( 0, 1, 2, 3 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %transposed_tile : vector<[4]x[4]xi32> - vector.print str "TILE END" + vector.print str "TILE END\n" return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir index e14917486d845df..3d74508cd23b573 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir @@ -23,9 +23,9 @@ func.func @entry() -> i32 { // CHECK-NEXT: ( 123, 123, 123, 123 // CHECK-NEXT: ( 123, 123, 123, 123 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" vector.print %tile : vector<[4]x[4]xi32> - vector.print str "TILE END" + vector.print str "TILE END\n" %c0_i32 = arith.constant 0 : i32 return %c0_i32 : i32 diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir index b29790db14ddc4e..48080fd0a26a2ba 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-load-store.mlir @@ -255,7 +255,7 @@ func.func @load_store_two_za_s_tiles() -> i32 { // CHECK-NEXT: ( 1, 1, 1, 1 // CHECK-NEXT: ( 1, 1, 1, 1 // CHECK: TILE END - vector.print str "TILE BEGIN" + vector.print str "TILE BEGIN\n" scf.for %i = %c0 to %size_of_two_tiles step %svl_s { %av = vector.load %mem2[%i] : memref, vector<[4]xi32> vector.print %av : vector<[4]xi32> @@ -263,11 +263,11 @@ func.func @load_store_two_za_s_tiles() -> i32 { %tileSizeMinusStep = arith.subi %size_of_tile, %svl_s : index %isNextTile = arith.cmpi eq, %i, %tileSizeMinusStep : index scf.if %isNextTile { - vector.print str "TILE END" - vector.print str "TILE BEGIN" + vector.print str "TILE END\n" + vector.print str "TILE BEGIN\n" } } - vector.print str "TILE END" + vector.print str "TILE END\n" return %c0_i32 : i32 } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir index c486bf0de5d3528..afb23e8e5206601 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir @@ -24,7 +24,7 @@ func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>) { /// Print each of the vectors. /// vscale is >= 1, so at least 8 elements will be printed. 
- vector.print str "read_and_print_2d_vector()" + vector.print str "read_and_print_2d_vector()\n" // CHECK-LABEL: read_and_print_2d_vector() // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8 vector.print %row0 : vector<[8]xf32> @@ -62,21 +62,21 @@ func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x // CHECK-LABEL: Vector A // CHECK-NEXT: ( 5, 5, 5, 5 // CHECK-NEXT: ( 5, 5, 5, 5 - vector.print str "\nVector A" + vector.print str "\nVector A\n" %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32> func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> () // CHECK-LABEL: Vector B // CHECK-NEXT: ( 4, 4, 4, 4 // CHECK-NEXT: ( 4, 4, 4, 4 - vector.print str "\nVector B" + vector.print str "\nVector B\n" %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32> func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> () // CHECK-LABEL: Sum // CHECK-NEXT: ( 9, 9, 9, 9 // CHECK-NEXT: ( 9, 9, 9, 9 - vector.print str "\nSum" + vector.print str "\nSum\n" %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32> func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> () @@ -97,7 +97,7 @@ func.func @entry() { linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>) - vector.print str "=> Print and read 2D arrays of scalable vectors:" + vector.print str "=> Print and read 2D arrays of scalable vectors:\n" func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> () vector.print str "\n====================\n" @@ -109,7 +109,7 @@ func.func @entry() { linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>) linalg.fill ins(%f32_4 : f32) outs(%test_2_memref_b :memref<1x2x?xf32>) - vector.print str "=> Reading and adding two 3D arrays of scalable vectors:" + vector.print str "=> Reading and adding two 3D arrays of scalable vectors:\n" func.call @add_arrays_of_scalable_vectors( %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> () diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir index 78d6609ccaf9a9d..25a44f22c2dc0ba 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/test-print-str.mlir @@ -7,8 +7,8 @@ func.func @entry() { // CHECK: Hello, World! - vector.print str "Hello, World!" + vector.print str "Hello, World!\n" // CHECK-NEXT: Bye! - vector.print str "Bye!" + vector.print str "Bye!\n" return } From baf6725b38491222f40a3c40bd27e57b0dd7f1f9 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Wed, 28 Feb 2024 10:39:14 -0800 Subject: [PATCH 107/114] [flang][runtime] Support NORM2 for REAL(16) with FortranFloat128Math lib. (#83219) Changed the lowering to call Norm2DimReal16 for REAL(16). Added the corresponding entry point to FortranFloat128Math, which required some restructuring in the related templates. 
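[Editor's note] The REAL(16) entry points reuse the runtime's
Norm2Accumulator (moved into reduction-templates.h below), which guards
against overflow and underflow by rescaling with the running maximum
magnitude. A simplified sketch of that scheme in plain C++, using double in
place of the templated accumulator types (names are illustrative, not the
runtime's API):

    #include <cmath>
    #include <cstddef>

    // Computes m * sqrt(1 + sum((others/m)^2)), where m is the largest
    // magnitude seen so far, mirroring Norm2Accumulator: squaring x[i]/m
    // instead of x[i] keeps every intermediate value in range.
    double norm2(const double *x, std::size_t n) {
      double max = 0.0, sum = 0.0; // m and sum((others/m)^2)
      for (std::size_t i = 0; i < n; ++i) {
        double a = std::fabs(x[i]);
        if (a == 0.0)
          continue; // zeros contribute nothing
        if (max == 0.0) {
          max = a;
        } else if (a > max) {
          double t = max / a, tsq = t * t;
          sum = sum * tsq + tsq; // rescale old sum; add term for old max
          max = a;
        } else {
          double t = a / max;
          sum += t * t;
        }
      }
      return max * std::sqrt(1.0 + sum);
    }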
--- flang/include/flang/Runtime/reduction.h | 5 +- .../Optimizer/Builder/Runtime/Reduction.cpp | 25 +++- flang/runtime/Float128Math/CMakeLists.txt | 1 + flang/runtime/Float128Math/math-entries.h | 16 +++ flang/runtime/Float128Math/norm2.cpp | 59 +++++++++ flang/runtime/extrema.cpp | 107 ++-------------- flang/runtime/reduction-templates.h | 115 ++++++++++++++++++ flang/runtime/tools.h | 11 +- flang/test/Lower/Intrinsics/norm2.f90 | 16 +++ 9 files changed, 256 insertions(+), 99 deletions(-) create mode 100644 flang/runtime/Float128Math/norm2.cpp diff --git a/flang/include/flang/Runtime/reduction.h b/flang/include/flang/Runtime/reduction.h index 6d62f4016937e0b..5b607765857523a 100644 --- a/flang/include/flang/Runtime/reduction.h +++ b/flang/include/flang/Runtime/reduction.h @@ -364,9 +364,12 @@ double RTDECL(Norm2_8)( #if LDBL_MANT_DIG == 64 long double RTDECL(Norm2_10)( const Descriptor &, const char *source, int line, int dim = 0); -#elif LDBL_MANT_DIG == 113 +#endif +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 long double RTDECL(Norm2_16)( const Descriptor &, const char *source, int line, int dim = 0); +void RTDECL(Norm2DimReal16)( + Descriptor &, const Descriptor &, int dim, const char *source, int line); #endif void RTDECL(Norm2Dim)( Descriptor &, const Descriptor &, int dim, const char *source, int line); diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp index fabbff818b6f0ef..66fbaddcbda1aae 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp @@ -149,6 +149,22 @@ struct ForcedNorm2Real16 { } }; +/// Placeholder for real*16 version of Norm2Dim Intrinsic +struct ForcedNorm2DimReal16 { + static constexpr const char *name = ExpandAndQuoteKey(RTNAME(Norm2DimReal16)); + static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() { + return [](mlir::MLIRContext *ctx) { + auto boxTy = + fir::runtime::getModel()(ctx); + auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8)); + auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int)); + return mlir::FunctionType::get( + ctx, {fir::ReferenceType::get(boxTy), boxTy, intTy, strTy, intTy}, + mlir::NoneType::get(ctx)); + }; + } +}; + /// Placeholder for real*10 version of Product Intrinsic struct ForcedProductReal10 { static constexpr const char *name = ExpandAndQuoteKey(RTNAME(ProductReal10)); @@ -876,7 +892,14 @@ mlir::Value fir::runtime::genMinval(fir::FirOpBuilder &builder, void fir::runtime::genNorm2Dim(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value resultBox, mlir::Value arrayBox, mlir::Value dim) { - auto func = fir::runtime::getRuntimeFunc(loc, builder); + mlir::func::FuncOp func; + auto ty = arrayBox.getType(); + auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty); + auto eleTy = arrTy.cast().getEleTy(); + if (eleTy.isF128()) + func = fir::runtime::getRuntimeFunc(loc, builder); + else + func = fir::runtime::getRuntimeFunc(loc, builder); auto fTy = func.getFunctionType(); auto sourceFile = fir::factory::locationToFilename(builder, loc); auto sourceLine = diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt index 8d276e8f1227286..f11678cd70b7696 100644 --- a/flang/runtime/Float128Math/CMakeLists.txt +++ b/flang/runtime/Float128Math/CMakeLists.txt @@ -69,6 +69,7 @@ set(sources log.cpp log10.cpp lround.cpp + norm2.cpp pow.cpp round.cpp sin.cpp diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h 
index 83298674c4971fd..a0d81d0cbb54073 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -54,6 +54,7 @@ namespace Fortran::runtime { }; // Define fallback callers. +DEFINE_FALLBACK(Abs) DEFINE_FALLBACK(Acos) DEFINE_FALLBACK(Acosh) DEFINE_FALLBACK(Asin) @@ -99,6 +100,7 @@ DEFINE_FALLBACK(Yn) // Use STD math functions. They provide IEEE-754 128-bit float // support either via 'long double' or __float128. // The Bessel's functions are not present in STD namespace. +DEFINE_SIMPLE_ALIAS(Abs, std::abs) DEFINE_SIMPLE_ALIAS(Acos, std::acos) DEFINE_SIMPLE_ALIAS(Acosh, std::acosh) DEFINE_SIMPLE_ALIAS(Asin, std::asin) @@ -155,6 +157,7 @@ DEFINE_SIMPLE_ALIAS(Yn, ynl) #elif HAS_QUADMATHLIB // Define wrapper callers for libquadmath. #include "quadmath.h" +DEFINE_SIMPLE_ALIAS(Abs, fabsq) DEFINE_SIMPLE_ALIAS(Acos, acosq) DEFINE_SIMPLE_ALIAS(Acosh, acoshq) DEFINE_SIMPLE_ALIAS(Asin, asinq) @@ -191,6 +194,19 @@ DEFINE_SIMPLE_ALIAS(Y0, y0q) DEFINE_SIMPLE_ALIAS(Y1, y1q) DEFINE_SIMPLE_ALIAS(Yn, ynq) #endif + +extern "C" { +// Declarations of the entry points that might be referenced +// within the Float128Math library itself. +// Note that not all of these entry points are actually +// defined in this library. Some of them are used just +// as template parameters to call the corresponding callee directly. +CppTypeFor RTDECL(AbsF128)( + CppTypeFor x); +CppTypeFor RTDECL(SqrtF128)( + CppTypeFor x); +} // extern "C" + } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_FLOAT128MATH_MATH_ENTRIES_H_ diff --git a/flang/runtime/Float128Math/norm2.cpp b/flang/runtime/Float128Math/norm2.cpp new file mode 100644 index 000000000000000..17453bd2d6cbd73 --- /dev/null +++ b/flang/runtime/Float128Math/norm2.cpp @@ -0,0 +1,59 @@ +//===-- runtime/Float128Math/norm2.cpp ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "math-entries.h" +#include "reduction-templates.h" +#include + +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 + +namespace { +using namespace Fortran::runtime; + +using AccumType = Norm2AccumType<16>; + +struct ABSTy { + static AccumType compute(AccumType x) { + return Sqrt::invoke(x); + } +}; + +struct SQRTTy { + static AccumType compute(AccumType x) { + return Sqrt::invoke(x); + } +}; + +using Float128Norm2Accumulator = Norm2Accumulator<16, ABSTy, SQRTTy>; +} // namespace + +namespace Fortran::runtime { +extern "C" { + +CppTypeFor RTDEF(Norm2_16)( + const Descriptor &x, const char *source, int line, int dim) { + auto accumulator{::Float128Norm2Accumulator(x)}; + return GetTotalReduction( + x, source, line, dim, nullptr, accumulator, "NORM2"); +} + +void RTDEF(Norm2DimReal16)(Descriptor &result, const Descriptor &x, int dim, + const char *source, int line) { + Terminator terminator{source, line}; + auto type{x.type().GetCategoryAndKind()}; + RUNTIME_CHECK(terminator, type); + RUNTIME_CHECK( + terminator, type->first == TypeCategory::Real && type->second == 16); + DoMaxMinNorm2( + result, x, dim, nullptr, "NORM2", terminator); +} + +} // extern "C" +} // namespace Fortran::runtime + +#endif diff --git a/flang/runtime/extrema.cpp b/flang/runtime/extrema.cpp index 3fdc8e159866d1e..fc2b4e165cb2698 100644 --- a/flang/runtime/extrema.cpp +++ b/flang/runtime/extrema.cpp @@ -528,35 +528,6 @@ inline RT_API_ATTRS CppTypeFor TotalNumericMaxOrMin( NumericExtremumAccumulator{x}, intrinsic); } -template -static RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, - int dim, const Descriptor *mask, const char *intrinsic, - Terminator &terminator) { - using Type = CppTypeFor; - ACCUMULATOR accumulator{x}; - if (dim == 0 || x.rank() == 1) { - // Total reduction - - // Element size of the destination descriptor is the same - // as the element size of the source. - result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr, - CFI_attribute_allocatable); - if (int stat{result.Allocate()}) { - terminator.Crash( - "%s: could not allocate memory for result; STAT=%d", intrinsic, stat); - } - DoTotalReduction(x, dim, mask, accumulator, intrinsic, terminator); - accumulator.GetResult(result.OffsetElement()); - } else { - // Partial reduction - - // Element size of the destination descriptor is the same - // as the element size of the source. - PartialReduction(result, x, x.ElementBytes(), dim, - mask, terminator, intrinsic, accumulator); - } -} - template struct MaxOrMinHelper { template struct Functor { RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, @@ -802,66 +773,11 @@ RT_EXT_API_GROUP_END // NORM2 -RT_VAR_GROUP_BEGIN - -// Use at least double precision for accumulators. -// Don't use __float128, it doesn't work with abs() or sqrt() yet. 
-static constexpr RT_CONST_VAR_ATTRS int largestLDKind { -#if LDBL_MANT_DIG == 113 - 16 -#elif LDBL_MANT_DIG == 64 - 10 -#else - 8 -#endif -}; - -RT_VAR_GROUP_END - -template class Norm2Accumulator { -public: - using Type = CppTypeFor; - using AccumType = - CppTypeFor; - explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array) - : array_{array} {} - RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; } - template - RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { - // m * sqrt(1 + sum((others(:)/m)**2)) - *p = static_cast(max_ * std::sqrt(1 + sum_)); - } - RT_API_ATTRS bool Accumulate(Type x) { - auto absX{std::abs(static_cast(x))}; - if (!max_) { - max_ = absX; - } else if (absX > max_) { - auto t{max_ / absX}; // < 1.0 - auto tsq{t * t}; - sum_ *= tsq; // scale sum to reflect change to the max - sum_ += tsq; // include a term for the previous max - max_ = absX; - } else { // absX <= max_ - auto t{absX / max_}; - sum_ += t * t; - } - return true; - } - template - RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) { - return Accumulate(*array_.Element(at)); - } - -private: - const Descriptor &array_; - AccumType max_{0}; // value (m) with largest magnitude - AccumType sum_{0}; // sum((others(:)/m)**2) -}; - template struct Norm2Helper { RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim, const Descriptor *mask, Terminator &terminator) const { - DoMaxMinNorm2>( + DoMaxMinNorm2::Type>( result, x, dim, mask, "NORM2", terminator); } }; @@ -872,26 +788,27 @@ RT_EXT_API_GROUP_BEGIN // TODO: REAL(2 & 3) CppTypeFor RTDEF(Norm2_4)( const Descriptor &x, const char *source, int line, int dim) { - return GetTotalReduction( - x, source, line, dim, nullptr, Norm2Accumulator<4>{x}, "NORM2"); + return GetTotalReduction(x, source, line, dim, nullptr, + Norm2AccumulatorGetter<4>::create(x), "NORM2"); } CppTypeFor RTDEF(Norm2_8)( const Descriptor &x, const char *source, int line, int dim) { - return GetTotalReduction( - x, source, line, dim, nullptr, Norm2Accumulator<8>{x}, "NORM2"); + return GetTotalReduction(x, source, line, dim, nullptr, + Norm2AccumulatorGetter<8>::create(x), "NORM2"); } #if LDBL_MANT_DIG == 64 CppTypeFor RTDEF(Norm2_10)( const Descriptor &x, const char *source, int line, int dim) { - return GetTotalReduction( - x, source, line, dim, nullptr, Norm2Accumulator<10>{x}, "NORM2"); + return GetTotalReduction(x, source, line, dim, + nullptr, Norm2AccumulatorGetter<10>::create(x), "NORM2"); } #endif #if LDBL_MANT_DIG == 113 +// The __float128 implementation resides in FortranFloat128Math library. 
CppTypeFor RTDEF(Norm2_16)( const Descriptor &x, const char *source, int line, int dim) { - return GetTotalReduction( - x, source, line, dim, nullptr, Norm2Accumulator<16>{x}, "NORM2"); + return GetTotalReduction(x, source, line, dim, + nullptr, Norm2AccumulatorGetter<16>::create(x), "NORM2"); } #endif @@ -901,7 +818,7 @@ void RTDEF(Norm2Dim)(Descriptor &result, const Descriptor &x, int dim, auto type{x.type().GetCategoryAndKind()}; RUNTIME_CHECK(terminator, type); if (type->first == TypeCategory::Real) { - ApplyFloatingPointKind( + ApplyFloatingPointKind( type->second, terminator, result, x, dim, nullptr, terminator); } else { terminator.Crash("NORM2: bad type code %d", x.type().raw()); diff --git a/flang/runtime/reduction-templates.h b/flang/runtime/reduction-templates.h index 7d0f82d59a084d3..0891bc021ff753a 100644 --- a/flang/runtime/reduction-templates.h +++ b/flang/runtime/reduction-templates.h @@ -25,6 +25,7 @@ #include "tools.h" #include "flang/Runtime/cpp-type.h" #include "flang/Runtime/descriptor.h" +#include namespace Fortran::runtime { @@ -332,5 +333,119 @@ template struct PartialLocationHelper { }; }; +// NORM2 templates + +RT_VAR_GROUP_BEGIN + +// Use at least double precision for accumulators. +// Don't use __float128, it doesn't work with abs() or sqrt() yet. +static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind { +#if LDBL_MANT_DIG == 113 || HAS_FLOAT128 + 16 +#elif LDBL_MANT_DIG == 64 + 10 +#else + 8 +#endif +}; + +RT_VAR_GROUP_END + +template +inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x, + int dim, const Descriptor *mask, const char *intrinsic, + Terminator &terminator) { + using Type = CppTypeFor; + ACCUMULATOR accumulator{x}; + if (dim == 0 || x.rank() == 1) { + // Total reduction + + // Element size of the destination descriptor is the same + // as the element size of the source. + result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr, + CFI_attribute_allocatable); + if (int stat{result.Allocate()}) { + terminator.Crash( + "%s: could not allocate memory for result; STAT=%d", intrinsic, stat); + } + DoTotalReduction(x, dim, mask, accumulator, intrinsic, terminator); + accumulator.GetResult(result.OffsetElement()); + } else { + // Partial reduction + + // Element size of the destination descriptor is the same + // as the element size of the source. + PartialReduction(result, x, x.ElementBytes(), dim, + mask, terminator, intrinsic, accumulator); + } +} + +// The data type used by Norm2Accumulator. 
+template +using Norm2AccumType = + CppTypeFor; + +template class Norm2Accumulator { +public: + using Type = CppTypeFor; + using AccumType = Norm2AccumType; + explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array) + : array_{array} {} + RT_API_ATTRS void Reinitialize() { max_ = sum_ = 0; } + template + RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const { + // m * sqrt(1 + sum((others(:)/m)**2)) + *p = static_cast(max_ * SQRT::compute(1 + sum_)); + } + RT_API_ATTRS bool Accumulate(Type x) { + auto absX{ABS::compute(static_cast(x))}; + if (!max_) { + max_ = absX; + } else if (absX > max_) { + auto t{max_ / absX}; // < 1.0 + auto tsq{t * t}; + sum_ *= tsq; // scale sum to reflect change to the max + sum_ += tsq; // include a term for the previous max + max_ = absX; + } else { // absX <= max_ + auto t{absX / max_}; + sum_ += t * t; + } + return true; + } + template + RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) { + return Accumulate(*array_.Element(at)); + } + +private: + const Descriptor &array_; + AccumType max_{0}; // value (m) with largest magnitude + AccumType sum_{0}; // sum((others(:)/m)**2) +}; + +// Helper class for creating Norm2Accumulator instance +// based on the given KIND. This helper returns and instance +// that uses std::abs and std::sqrt for the computations. +template class Norm2AccumulatorGetter { + using AccumType = Norm2AccumType; + +public: + struct ABSTy { + static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) { + return std::abs(std::forward(x)); + } + }; + struct SQRTTy { + static constexpr RT_API_ATTRS AccumType compute(AccumType &&x) { + return std::sqrt(std::forward(x)); + } + }; + + using Type = Norm2Accumulator; + + static RT_API_ATTRS Type create(const Descriptor &x) { return Type(x); } +}; + } // namespace Fortran::runtime #endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_ diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h index 89e5069995748b5..c1f89cadca06e79 100644 --- a/flang/runtime/tools.h +++ b/flang/runtime/tools.h @@ -266,7 +266,8 @@ inline RT_API_ATTRS RESULT ApplyIntegerKind( } } -template