From 496f8e5b369f091def93482578232da8c6e77a7a Mon Sep 17 00:00:00 2001
From: Hamilton Tobon Mosquera
Date: Mon, 17 Aug 2020 20:18:21 -0500
Subject: [PATCH 001/101] [OpenMPOpt][HideMemTransfersLatency] Split
 __tgt_target_data_begin_mapper into its "issue" and "wait" counterparts.

WIP that tries to hide the latency of runtime calls that involve host to
device memory transfers by splitting them into their "issue" and "wait"
versions. The "issue" is moved upwards as much as possible. The "wait" is
moved downwards as much as possible. The "issue" issues the memory transfer
asynchronously, returning a handle. The "wait" waits on the returned handle
for the memory transfer to finish. The actual code motion of the "issue" and
"wait" calls is not implemented yet.
---
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |  4 ++
 llvm/lib/Transforms/IPO/OpenMPOpt.cpp         | 66 +++++++++++++++++++
 .../OpenMP/hide_mem_transfer_latency.ll       | 54 ++++++---------
 3 files changed, 90 insertions(+), 34 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 3fc87dc34cd346..9ad7efff6ef567 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -198,6 +198,7 @@ __OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)
   OMP_STRUCT_TYPE(VarName, "struct." #Name, __VA_ARGS__)
 
 __OMP_STRUCT_TYPE(Ident, ident_t, Int32, Int32, Int32, Int32, Int8Ptr)
+__OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, Int8Ptr)
 
 #undef __OMP_STRUCT_TYPE
 #undef OMP_STRUCT_TYPE
@@ -482,6 +483,9 @@ __OMP_RTL(__tgt_target_data_begin_mapper, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_data_begin_nowait_mapper, false, Void, Int64, Int32,
           VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_issue, false, AsyncInfo, Int64, Int32,
+          VoidPtrPtr, VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
+__OMP_RTL(__tgt_target_data_begin_mapper_wait, false, Void, Int64, AsyncInfo)
 __OMP_RTL(__tgt_target_data_end_mapper, false, Void, Int64, Int32, VoidPtrPtr,
           VoidPtrPtr, Int64Ptr, Int64Ptr, VoidPtrPtr)
 __OMP_RTL(__tgt_target_data_end_nowait_mapper, false, Void, Int64, Int32,
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 93f1e5392eb2cc..ae7bafd7d91e5c 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -42,6 +42,13 @@ static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
 static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                         cl::init(false), cl::Hidden);
 
+static cl::opt<bool> HideMemoryTransferLatency(
+    "openmp-hide-memory-transfer-latency",
+    cl::desc("[WIP] Tries to hide the latency of host to device memory"
+             " transfers"),
+    cl::Hidden, cl::init(false));
+
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -508,6 +515,8 @@ struct OpenMPOpt {
     Changed |= deduplicateRuntimeCalls();
     Changed |= deleteParallelRegions();
+    if (HideMemoryTransferLatency)
+      Changed |= hideMemTransfersLatency();
 
     return Changed;
   }
@@ -666,6 +675,63 @@ struct OpenMPOpt {
     return Changed;
   }
 
+  /// Tries to hide the latency of runtime calls that involve host to
+  /// device memory transfers by splitting them into their "issue" and "wait"
+  /// versions. The "issue" is moved upwards as much as possible. The "wait" is
+  /// moved downwards as much as possible. The "issue" issues the memory
+  /// transfer asynchronously, returning a handle. The "wait" waits on the
+  /// returned handle for the memory transfer to finish.
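+  /// For example (mirroring the IR checked in the accompanying test), a
+  /// synchronous transfer such as
+  ///   call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, ...)
+  /// is rewritten into
+  ///   %handle = call %struct.__tgt_async_info
+  ///       @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, ...)
+  ///   call void @__tgt_target_data_begin_mapper_wait(i64 -1,
+  ///       %struct.__tgt_async_info %handle)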
+  bool hideMemTransfersLatency() {
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
+    bool Changed = false;
+    auto SplitMemTransfers = [&](Use &U, Function &Decl) {
+      auto *RTCall = getCallIfRegularCall(U, &RFI);
+      if (!RTCall)
+        return false;
+
+      bool WasSplit = splitTargetDataBeginRTC(RTCall);
+      Changed |= WasSplit;
+      return WasSplit;
+    };
+    RFI.foreachUse(SCC, SplitMemTransfers);
+
+    return Changed;
+  }
+
+  /// Splits \p RuntimeCall into its "issue" and "wait" counterparts.
+  bool splitTargetDataBeginRTC(CallInst *RuntimeCall) {
+    auto &IRBuilder = OMPInfoCache.OMPBuilder;
+    // Add the "issue" runtime call declaration:
+    // declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(
+    //     i64, i32, i8**, i8**, i64*, i64*, i8**)
+    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_issue);
+
+    // Replace the RuntimeCall call site with its asynchronous version.
+    SmallVector<Value *, 16> Args;
+    for (auto &Arg : RuntimeCall->args())
+      Args.push_back(Arg.get());
+
+    CallInst *IssueCallsite =
+        CallInst::Create(IssueDecl, Args, "handle", RuntimeCall);
+    RuntimeCall->eraseFromParent();
+
+    // Add the "wait" runtime call declaration:
+    // declare void @__tgt_target_data_begin_mapper_wait(i64,
+    //     %struct.__tgt_async_info)
+    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
+        M, OMPRTL___tgt_target_data_begin_mapper_wait);
+
+    // Add a call site to WaitDecl that waits on the handle returned by the
+    // "issue" call.
+    Value *WaitParams[2] = {
+        IssueCallsite->getArgOperand(0), // device_id.
+        IssueCallsite                    // returned handle.
+    };
+    CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"",
+                     IssueCallsite->getNextNode());
+
+    return true;
+  }
+
   static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                     bool GlobalOnly, bool &SingleChoice) {
     if (CurrentIdent == NextIdent)
diff --git a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
index daebe4b52ace5a..7f55ad12af2d71 100644
--- a/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
+++ b/llvm/test/Transforms/OpenMP/hide_mem_transfer_latency.ll
@@ -1,9 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature --scrub-attributes
-; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa < %s | FileCheck %s
+; RUN: opt -S -passes=openmpopt -aa-pipeline=basic-aa -openmp-hide-memory-transfer-latency < %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 
-; FIXME: This struct should be generated after splitting at least one of the runtime calls.
-; %struct.__tgt_async_info = type { i8* } +; CHECK: %struct.__tgt_async_info = type { i8* } %struct.ident_t = type { i32, i32, i32, i32, i8* } %struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } @@ -58,7 +57,10 @@ define dso_local double @heavyComputation1() { ; CHECK-NEXT: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 ; CHECK-NEXT: %4 = bitcast [1 x i8*]* %.offload_ptrs to double** ; CHECK-NEXT: store double* %a, double** %4, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %5 = bitcast double* %a to i64* ; CHECK-NEXT: %6 = load i64, i64* %5, align 8 ; CHECK-NEXT: %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs4, i64 0, i64 0 @@ -102,11 +104,6 @@ entry: %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 %4 = bitcast [1 x i8*]* %.offload_ptrs to double** store double* %a, double** %4, align 8 - ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. - ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...) 
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle) call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes.1, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i8** null) %5 = bitcast double* %a to i64* @@ -186,7 +183,10 @@ define dso_local i32 @heavyComputation2(double* %a, i32 %size) { ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 @@ -241,12 +241,6 @@ entry: store i32* %size.addr, i32** %9, align 8 %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 store i64 4, i64* %10, align 8 - ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. Here though, the "issue" cannot be moved upwards - ; because it's not guaranteed that rand() won't modify *a. - ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...) 
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle) call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) %11 = load i32, i32* %size.addr, align 4 @@ -330,7 +324,10 @@ define dso_local i32 @heavyComputation3(double* noalias %a, i32 %size) { ; CHECK-NEXT: store i32* %size.addr, i32** %9, align 8 ; CHECK-NEXT: %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 ; CHECK-NEXT: store i64 4, i64* %10, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 2, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %11 = load i32, i32* %size.addr, align 4 ; CHECK-NEXT: %size.casted = zext i32 %11 to i64 ; CHECK-NEXT: %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs2, i64 0, i64 0 @@ -386,11 +383,6 @@ entry: store i32* %size.addr, i32** %9, align 8 %10 = getelementptr inbounds [2 x i64], [2 x i64]* %.offload_sizes, i64 0, i64 1 store i64 4, i64* %10, align 8 - ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. - ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...) 
- ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle) call void @__tgt_target_data_begin_mapper(i64 -1, i32 2, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes.3, i64 0, i64 0), i8** null) %11 = load i32, i32* %size.addr, align 4 @@ -459,7 +451,10 @@ define dso_local i32 @dataTransferOnly1(double* noalias %a, i32 %size) { ; CHECK-NEXT: store double* %a, double** %4, align 8 ; CHECK-NEXT: %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 ; CHECK-NEXT: store i64 %0, i64* %5, align 8 -; CHECK-NEXT: call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) + +; CHECK-NEXT: %handle = call %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64 -1, i32 1, i8** %1, i8** %3, i64* %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) +; CHECK-NEXT: call void @__tgt_target_data_begin_mapper_wait(i64 -1, %struct.__tgt_async_info %handle) + ; CHECK-NEXT: %rem = urem i32 %call, %size ; CHECK-NEXT: call void @__tgt_target_data_end_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) ; CHECK-NEXT: ret i32 %rem @@ -482,13 +477,6 @@ entry: store double* %a, double** %4, align 8 %5 = getelementptr inbounds [1 x i64], [1 x i64]* %.offload_sizes, i64 0, i64 0 store i64 %0, i64* %5, align 8 - ; FIXME: This setup for the runtime call __tgt_target_data_begin_mapper should be - ; split into its "issue" and "wait" counterpars and moved upwards - ; and downwards, respectively. Here though, the "wait" cannot be moved downwards - ; because it is not worthit. That is, there is no store nor call to be hoisted - ; over. - ; %handle = call i8* @__tgt_target_data_begin_mapper_issue(...) - ; call void @__tgt_target_data_begin_wait(i64 -1, %struct.__tgt_async_info %handle) call void @__tgt_target_data_begin_mapper(i64 -1, i32 1, i8** nonnull %1, i8** nonnull %3, i64* nonnull %5, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes.5, i64 0, i64 0), i8** null) %rem = urem i32 %call, %size @@ -503,7 +491,5 @@ declare void @__tgt_target_data_end_mapper(i64, i32, i8**, i8**, i64*, i64*, i8* declare dso_local i32 @rand(...) -; FIXME: These two function declarations must be generated after splitting the runtime function -; __tgt_target_data_begin_mapper. -; declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**) -; declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info) +; CHECK: declare %struct.__tgt_async_info @__tgt_target_data_begin_mapper_issue(i64, i32, i8**, i8**, i64*, i64*, i8**) +; CHECK: declare void @__tgt_target_data_begin_mapper_wait(i64, %struct.__tgt_async_info) From 2af4c2b2b1be0333a14fbf82d9e31f62d0f3106c Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Mon, 17 Aug 2020 17:48:04 -0700 Subject: [PATCH 002/101] [NewPM] Pin various tests under Other/ to legacy PM These all are legacy PM-specific or have a corresponding NPM RUN line. 
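
For example, a RUN line such as

    ; RUN: opt -O2 -debug-pass=Structure %s 2>&1 | FileCheck %s

is pinned to the legacy pass manager as

    ; RUN: opt -enable-new-pm=0 -O2 -debug-pass=Structure %s 2>&1 | FileCheck %s

(The exact flags vary per test; see the diffs below.)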
Reviewed By: ychen Differential Revision: https://reviews.llvm.org/D86124 --- llvm/test/Other/cspgo-O2-pipeline.ll | 4 ++-- llvm/test/Other/loop-pass-printer.ll | 6 +++--- llvm/test/Other/opt-O0-pipeline-enable-matrix.ll | 2 +- llvm/test/Other/opt-O0-pipeline.ll | 2 +- llvm/test/Other/opt-O2-pipeline.ll | 2 +- llvm/test/Other/opt-O3-pipeline-enable-matrix.ll | 2 +- llvm/test/Other/opt-O3-pipeline.ll | 2 +- llvm/test/Other/opt-Os-pipeline.ll | 2 +- llvm/test/Other/opt-pipeline-vector-passes.ll | 10 +++++----- llvm/test/Other/optimize-options.ll | 10 +++++----- llvm/test/Other/pass-pipelines.ll | 6 +++--- llvm/test/Other/print-cfg-sccs.ll | 2 +- llvm/test/Other/print-module-scope.ll | 4 ++-- llvm/test/Other/printer.ll | 2 +- llvm/test/Other/time-passes.ll | 6 +++--- 15 files changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/test/Other/cspgo-O2-pipeline.ll b/llvm/test/Other/cspgo-O2-pipeline.ll index 974213c83c8128..26f2e338cbc8b7 100644 --- a/llvm/test/Other/cspgo-O2-pipeline.ll +++ b/llvm/test/Other/cspgo-O2-pipeline.ll @@ -1,13 +1,13 @@ ; Test CSGen pass in CSPGO. ; RUN: llvm-profdata merge %S/Inputs/cspgo-noncs.proftext -o %t-noncs.profdata ; RUN: llvm-profdata merge %S/Inputs/cspgo-cs.proftext -o %t-cs.profdata -; RUN: opt -O2 -debug-pass=Structure -pgo-kind=pgo-instr-use-pipeline -profile-file='%t-noncs.profdata' -cspgo-kind=cspgo-instr-gen-pipeline -cs-profilegen-file=alloc %s 2>&1 |FileCheck %s --check-prefixes=CSGENDEFAULT +; RUN: opt -enable-new-pm=0 -O2 -debug-pass=Structure -pgo-kind=pgo-instr-use-pipeline -profile-file='%t-noncs.profdata' -cspgo-kind=cspgo-instr-gen-pipeline -cs-profilegen-file=alloc %s 2>&1 |FileCheck %s --check-prefixes=CSGENDEFAULT ; CSGENDEFAULT: PGOInstrumentationUse ; CSGENDEFAULT: PGOInstrumentationGenCreateVar ; CSGENDEFAULT: PGOInstrumentationGen ; Test CSUse pass in CSPGO. -; RUN: opt -O2 -debug-pass=Structure -pgo-kind=pgo-instr-use-pipeline -profile-file='%t-cs.profdata' -cspgo-kind=cspgo-instr-use-pipeline %s 2>&1 |FileCheck %s --check-prefixes=CSUSEDEFAULT +; RUN: opt -enable-new-pm=0 -O2 -debug-pass=Structure -pgo-kind=pgo-instr-use-pipeline -profile-file='%t-cs.profdata' -cspgo-kind=cspgo-instr-use-pipeline %s 2>&1 |FileCheck %s --check-prefixes=CSUSEDEFAULT ; CSUSEDEFAULT: PGOInstrumentationUse ; CSUSEDEFAULT-NOT: PGOInstrumentationGenCreateVar ; CSUSEDEFAULT: PGOInstrumentationUse diff --git a/llvm/test/Other/loop-pass-printer.ll b/llvm/test/Other/loop-pass-printer.ll index aab4dc91573a5f..c74d202f262158 100644 --- a/llvm/test/Other/loop-pass-printer.ll +++ b/llvm/test/Other/loop-pass-printer.ll @@ -1,19 +1,19 @@ ; This test checks -print-after/before on loop passes ; Besides of the loop itself it should be dumping loop pre-header and exits. 
; -; RUN: opt < %s 2>&1 -disable-output \ +; RUN: opt -enable-new-pm=0 < %s 2>&1 -disable-output \ ; RUN: -loop-deletion -print-before=loop-deletion \ ; RUN: | FileCheck %s -check-prefix=DEL ; RUN: opt < %s 2>&1 -disable-output \ ; RUN: -passes='loop(loop-deletion)' -print-before-all \ ; RUN: | FileCheck %s -check-prefix=DEL -; RUN: opt < %s 2>&1 -disable-output \ +; RUN: opt -enable-new-pm=0 < %s 2>&1 -disable-output \ ; RUN: -loop-unroll -print-after=loop-unroll -filter-print-funcs=bar \ ; RUN: | FileCheck %s -check-prefix=BAR -check-prefix=BAR-OLD ; RUN: opt < %s 2>&1 -disable-output \ ; RUN: -passes='require,loop(loop-unroll-full)' -print-after-all -filter-print-funcs=bar \ ; RUN: | FileCheck %s -check-prefix=BAR -; RUN: opt < %s 2>&1 -disable-output \ +; RUN: opt -enable-new-pm=0 < %s 2>&1 -disable-output \ ; RUN: -loop-unroll -print-after=loop-unroll -filter-print-funcs=foo -print-module-scope \ ; RUN: | FileCheck %s -check-prefix=FOO-MODULE -check-prefix=FOO-MODULE-OLD ; RUN: opt < %s 2>&1 -disable-output \ diff --git a/llvm/test/Other/opt-O0-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O0-pipeline-enable-matrix.ll index 401cbb976a4164..f754f6d1a513eb 100644 --- a/llvm/test/Other/opt-O0-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O0-pipeline-enable-matrix.ll @@ -1,4 +1,4 @@ -; RUN: opt -O0 -enable-matrix -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -enable-new-pm=0 -O0 -enable-matrix -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-O0-pipeline.ll b/llvm/test/Other/opt-O0-pipeline.ll index ce431a502f93cb..6900b88cbb4ebf 100644 --- a/llvm/test/Other/opt-O0-pipeline.ll +++ b/llvm/test/Other/opt-O0-pipeline.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=CHECK,%llvmcheckext +; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=CHECK,%llvmcheckext ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll index 56f85d0fb9a8c1..e606e7cfac1716 100644 --- a/llvm/test/Other/opt-O2-pipeline.ll +++ b/llvm/test/Other/opt-O2-pipeline.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s +; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll index a0b7a8f5e1e3d7..aaee6f786bac91 100644 --- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll +++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll @@ -1,4 +1,4 @@ -; RUN: opt -O3 -enable-matrix -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s +; RUN: opt -enable-new-pm=0 -O3 -enable-matrix -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll index 942f7d9dfead59..b2d2f85ae21be2 100644 --- a/llvm/test/Other/opt-O3-pipeline.ll +++ b/llvm/test/Other/opt-O3-pipeline.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s +; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -O3 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck 
--check-prefixes=CHECK,%llvmcheckext %s ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll index d975cc48b629c3..cc91707c4b009d 100644 --- a/llvm/test/Other/opt-Os-pipeline.ll +++ b/llvm/test/Other/opt-Os-pipeline.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=x86_64-- -Os -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s +; RUN: opt -enable-new-pm=0 -mtriple=x86_64-- -Os -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck --check-prefixes=CHECK,%llvmcheckext %s ; REQUIRES: asserts diff --git a/llvm/test/Other/opt-pipeline-vector-passes.ll b/llvm/test/Other/opt-pipeline-vector-passes.ll index c9966d43e49126..5a76bfed168542 100644 --- a/llvm/test/Other/opt-pipeline-vector-passes.ll +++ b/llvm/test/Other/opt-pipeline-vector-passes.ll @@ -1,8 +1,8 @@ -; RUN: opt -O1 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O1 -; RUN: opt -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2 -; RUN: opt -O2 -extra-vectorizer-passes -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2_EXTRA -; RUN: opt -O1 -vectorize-loops=0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O1_FORCE_OFF -; RUN: opt -O2 -vectorize-loops=0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2_FORCE_OFF +; RUN: opt -enable-new-pm=0 -O1 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O1 +; RUN: opt -enable-new-pm=0 -O2 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2 +; RUN: opt -enable-new-pm=0 -O2 -extra-vectorizer-passes -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2_EXTRA +; RUN: opt -enable-new-pm=0 -O1 -vectorize-loops=0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O1_FORCE_OFF +; RUN: opt -enable-new-pm=0 -O2 -vectorize-loops=0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s --check-prefixes=OLDPM_O2_FORCE_OFF ; RUN: opt -disable-verify -debug-pass-manager -passes='default' -S %s 2>&1 | FileCheck %s --check-prefixes=NEWPM_O1 ; RUN: opt -disable-verify -debug-pass-manager -passes='default' -S %s 2>&1 | FileCheck %s --check-prefixes=NEWPM_O2 diff --git a/llvm/test/Other/optimize-options.ll b/llvm/test/Other/optimize-options.ll index 22dd842cab0696..ab2fc8f75b73bf 100644 --- a/llvm/test/Other/optimize-options.ll +++ b/llvm/test/Other/optimize-options.ll @@ -1,8 +1,8 @@ -;RUN: opt -S -O1 -debug-pass=Arguments %s 2>&1 | FileCheck %s -;RUN: opt -S -O2 -debug-pass=Arguments %s 2>&1 | FileCheck %s -;RUN: opt -S -Os -debug-pass=Arguments %s 2>&1 | FileCheck %s -;RUN: opt -S -Oz -debug-pass=Arguments %s 2>&1 | FileCheck %s -;RUN: opt -S -O3 -debug-pass=Arguments %s 2>&1 | FileCheck %s +;RUN: opt -enable-new-pm=0 -S -O1 -debug-pass=Arguments %s 2>&1 | FileCheck %s +;RUN: opt -enable-new-pm=0 -S -O2 -debug-pass=Arguments %s 2>&1 | FileCheck %s +;RUN: opt -enable-new-pm=0 -S -Os -debug-pass=Arguments %s 2>&1 | FileCheck %s +;RUN: opt -enable-new-pm=0 -S -Oz -debug-pass=Arguments %s 2>&1 | FileCheck %s +;RUN: opt -enable-new-pm=0 -S -O3 -debug-pass=Arguments %s 2>&1 | FileCheck %s ; Just check that we get a non-empty set of passes for each -O option. 
;CHECK: Pass Arguments: {{.*}} -print-module diff --git a/llvm/test/Other/pass-pipelines.ll b/llvm/test/Other/pass-pipelines.ll index 620325ec1d5ee6..ccd364d5d74044 100644 --- a/llvm/test/Other/pass-pipelines.ll +++ b/llvm/test/Other/pass-pipelines.ll @@ -3,15 +3,15 @@ ; legacy pass manager doesn't introduce unexpected structural changes in the ; pass pipeline. ; -; RUN: opt -disable-output -disable-verify -debug-pass=Structure \ +; RUN: opt -enable-new-pm=0 -disable-output -disable-verify -debug-pass=Structure \ ; RUN: -O2 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-O2 ; RUN: llvm-profdata merge %S/Inputs/pass-pipelines.proftext -o %t.profdata -; RUN: opt -disable-output -disable-verify -debug-pass=Structure \ +; RUN: opt -enable-new-pm=0 -disable-output -disable-verify -debug-pass=Structure \ ; RUN: -pgo-kind=pgo-instr-use-pipeline -profile-file='%t.profdata' \ ; RUN: -O2 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-O2 --check-prefix=PGOUSE -; RUN: opt -disable-output -disable-verify -debug-pass=Structure \ +; RUN: opt -enable-new-pm=0 -disable-output -disable-verify -debug-pass=Structure \ ; RUN: -pgo-kind=pgo-instr-use-pipeline -profile-file='%t.profdata' \ ; RUN: -hot-cold-split \ ; RUN: -O2 %s 2>&1 \ diff --git a/llvm/test/Other/print-cfg-sccs.ll b/llvm/test/Other/print-cfg-sccs.ll index 43e885476bca81..6162b2d38fed5f 100644 --- a/llvm/test/Other/print-cfg-sccs.ll +++ b/llvm/test/Other/print-cfg-sccs.ll @@ -1,4 +1,4 @@ -; RUN: opt -print-cfg-sccs -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -enable-new-pm=0 -print-cfg-sccs -disable-output < %s 2>&1 | FileCheck %s ; CHECK: SCCs for Function test in PostOrder: ; CHECK-NEXT: SCC #1 : %exit, diff --git a/llvm/test/Other/print-module-scope.ll b/llvm/test/Other/print-module-scope.ll index 54e087ff29d25f..08d6bbb3a28b0a 100644 --- a/llvm/test/Other/print-module-scope.ll +++ b/llvm/test/Other/print-module-scope.ll @@ -3,13 +3,13 @@ ; - all the function attributes are shown, including those of declarations ; - works on top of -print-after and -filter-print-funcs ; -; RUN: opt < %s 2>&1 -disable-output \ +; RUN: opt -enable-new-pm=0 < %s 2>&1 -disable-output \ ; RUN: -simplifycfg -print-after=simplifycfg -print-module-scope \ ; RUN: | FileCheck %s -check-prefix=CFG ; RUN: opt < %s 2>&1 -disable-output \ ; RUN: -passes=simplify-cfg -print-after-all -print-module-scope \ ; RUN: | FileCheck %s -check-prefix=CFG -; RUN: opt < %s 2>&1 -disable-output \ +; RUN: opt -enable-new-pm=0 < %s 2>&1 -disable-output \ ; RUN: -simplifycfg -print-after=simplifycfg -filter-print-funcs=foo -print-module-scope \ ; RUN: | FileCheck %s -check-prefix=FOO ; RUN: opt < %s 2>&1 -disable-output \ diff --git a/llvm/test/Other/printer.ll b/llvm/test/Other/printer.ll index 86337656285500..f5fdbfc1d7099c 100644 --- a/llvm/test/Other/printer.ll +++ b/llvm/test/Other/printer.ll @@ -1,4 +1,4 @@ -; RUN: opt -mem2reg -instcombine -print-after-all -disable-output < %s 2>&1 | \ +; RUN: opt -enable-new-pm=0 -mem2reg -instcombine -print-after-all -disable-output < %s 2>&1 | \ ; RUN: FileCheck --check-prefixes=CHECK,OLDPM %s --implicit-check-not='IR Dump' ; RUN: opt -passes='mem2reg,instcombine' -print-after-all -disable-output < %s 2>&1 | \ ; RUN: FileCheck --check-prefixes=CHECK,NEWPM %s --implicit-check-not='IR Dump' diff --git a/llvm/test/Other/time-passes.ll b/llvm/test/Other/time-passes.ll index 743b4ebc0d6d1a..e3b5a003703079 100644 --- a/llvm/test/Other/time-passes.ll +++ b/llvm/test/Other/time-passes.ll @@ -1,11 +1,11 @@ -; RUN: opt < %s 
-disable-output -instcombine -instcombine -licm -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-LEGACY -; RUN: opt < %s -disable-output -instcombine -instcombine -licm -licm -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-LEGACY --check-prefix=TIME-DOUBLE-LICM-LEGACY +; RUN: opt -enable-new-pm=0 < %s -disable-output -instcombine -instcombine -licm -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-LEGACY +; RUN: opt -enable-new-pm=0 < %s -disable-output -instcombine -instcombine -licm -licm -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-LEGACY --check-prefix=TIME-DOUBLE-LICM-LEGACY ; RUN: opt < %s -disable-output -passes='instcombine,instcombine,loop(licm)' -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-NEW ; RUN: opt < %s -disable-output -passes='instcombine,loop(licm),instcombine,loop(licm)' -time-passes 2>&1 | FileCheck %s --check-prefix=TIME --check-prefix=TIME-NEW -check-prefix=TIME-DOUBLE-LICM-NEW ; RUN: opt < %s -disable-output -passes='default' -time-passes 2>&1 | FileCheck %s --check-prefix=TIME ; ; The following 4 test runs verify -info-output-file interaction (default goes to stderr, '-' goes to stdout). -; RUN: opt < %s -disable-output -O2 -time-passes -info-output-file='-' 2>/dev/null | FileCheck %s --check-prefix=TIME +; RUN: opt -enable-new-pm=0 < %s -disable-output -O2 -time-passes -info-output-file='-' 2>/dev/null | FileCheck %s --check-prefix=TIME ; RUN: opt < %s -disable-output -passes='default' -time-passes -info-output-file='-' 2>/dev/null | FileCheck %s --check-prefix=TIME ; ; RUN: rm -f %t; opt < %s -disable-output -O2 -time-passes -info-output-file=%t From c7ec3a7e338cd8e58424a66d29162e9b6a5847f7 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 12 Aug 2020 09:23:05 -0500 Subject: [PATCH 003/101] [PowerPC] Implement Vector Extract Mask builtins in LLVM/Clang This patch implements the vec_extractm function prototypes in altivec.h in order to utilize the vector extract with mask instructions introduced in Power10. Differential Revision: https://reviews.llvm.org/D82675 --- clang/include/clang/Basic/BuiltinsPPC.def | 7 ++ clang/lib/Headers/altivec.h | 28 ++++++++ clang/test/CodeGen/builtins-ppc-p10vector.c | 30 +++++++++ llvm/include/llvm/IR/IntrinsicsPowerPC.td | 12 ++++ llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 15 +++-- .../CodeGen/PowerPC/p10-vector-mask-ops.ll | 66 +++++++++++++++++++ 6 files changed, 153 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index b79ed41284ace8..73c60780041572 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -298,6 +298,13 @@ BUILTIN(__builtin_altivec_vrldmi, "V2ULLiV2ULLiV2ULLiV2ULLi", "") BUILTIN(__builtin_altivec_vrlwnm, "V4UiV4UiV4Ui", "") BUILTIN(__builtin_altivec_vrldnm, "V2ULLiV2ULLiV2ULLi", "") +// P10 Vector Extract with Mask built-ins. +BUILTIN(__builtin_altivec_vextractbm, "UiV16Uc", "") +BUILTIN(__builtin_altivec_vextracthm, "UiV8Us", "") +BUILTIN(__builtin_altivec_vextractwm, "UiV4Ui", "") +BUILTIN(__builtin_altivec_vextractdm, "UiV2ULLi", "") +BUILTIN(__builtin_altivec_vextractqm, "UiV1ULLLi", "") + // P10 Vector Parallel Bits built-ins. 
BUILTIN(__builtin_altivec_vpdepd, "V2ULLiV2ULLiV2ULLi", "") BUILTIN(__builtin_altivec_vpextd, "V2ULLiV2ULLiV2ULLi", "") diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h index ac4182613cdda3..b1e70f6c41bbc6 100644 --- a/clang/lib/Headers/altivec.h +++ b/clang/lib/Headers/altivec.h @@ -16815,6 +16815,34 @@ static vector signed char __ATTRS_o_ai vec_nabs(vector signed char __a) { } #ifdef __POWER10_VECTOR__ + +/* vec_extractm */ + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned char __a) { + return __builtin_altivec_vextractbm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned short __a) { + return __builtin_altivec_vextracthm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned int __a) { + return __builtin_altivec_vextractwm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned long long __a) { + return __builtin_altivec_vextractdm(__a); +} + +static __inline__ unsigned int __ATTRS_o_ai +vec_extractm(vector unsigned __int128 __a) { + return __builtin_altivec_vextractqm(__a); +} + /* vec_pdep */ static __inline__ vector unsigned long long __ATTRS_o_ai diff --git a/clang/test/CodeGen/builtins-ppc-p10vector.c b/clang/test/CodeGen/builtins-ppc-p10vector.c index a575f5a924c5e6..fe3e678a579485 100644 --- a/clang/test/CodeGen/builtins-ppc-p10vector.c +++ b/clang/test/CodeGen/builtins-ppc-p10vector.c @@ -97,6 +97,36 @@ vector unsigned long long test_vpextd(void) { return vec_pext(vulla, vullb); } +unsigned int test_vec_extractm_uc(void) { + // CHECK: @llvm.ppc.altivec.vextractbm(<16 x i8> %{{.+}}) + // CHECK-NEXT: ret i32 + return vec_extractm(vuca); +} + +unsigned int test_vec_extractm_us(void) { + // CHECK: @llvm.ppc.altivec.vextracthm(<8 x i16> %{{.+}}) + // CHECK-NEXT: ret i32 + return vec_extractm(vusa); +} + +unsigned int test_vec_extractm_ui(void) { + // CHECK: @llvm.ppc.altivec.vextractwm(<4 x i32> %{{.+}}) + // CHECK-NEXT: ret i32 + return vec_extractm(vuia); +} + +unsigned int test_vec_extractm_ull(void) { + // CHECK: @llvm.ppc.altivec.vextractdm(<2 x i64> %{{.+}}) + // CHECK-NEXT: ret i32 + return vec_extractm(vulla); +} + +unsigned int test_vec_extractm_u128(void) { + // CHECK: @llvm.ppc.altivec.vextractqm(<1 x i128> %{{.+}}) + // CHECK-NEXT: ret i32 + return vec_extractm(vui128a); +} + vector unsigned long long test_vcfuged(void) { // CHECK: @llvm.ppc.altivec.vcfuged(<2 x i64> // CHECK-NEXT: ret <2 x i64> diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index ae25bb400e463f..ce4c98968a7b70 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -434,6 +434,18 @@ let TargetPrefix = "ppc" in { // All intrinsics start with "llvm.ppc.". 
def int_ppc_altivec_vprtybq : GCCBuiltin<"__builtin_altivec_vprtybq">, Intrinsic<[llvm_v1i128_ty],[llvm_v1i128_ty],[IntrNoMem]>; + // P10 Vector Extract with Mask + def int_ppc_altivec_vextractbm : GCCBuiltin<"__builtin_altivec_vextractbm">, + Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; + def int_ppc_altivec_vextracthm : GCCBuiltin<"__builtin_altivec_vextracthm">, + Intrinsic<[llvm_i32_ty], [llvm_v8i16_ty], [IntrNoMem]>; + def int_ppc_altivec_vextractwm : GCCBuiltin<"__builtin_altivec_vextractwm">, + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; + def int_ppc_altivec_vextractdm : GCCBuiltin<"__builtin_altivec_vextractdm">, + Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty], [IntrNoMem]>; + def int_ppc_altivec_vextractqm : GCCBuiltin<"__builtin_altivec_vextractqm">, + Intrinsic<[llvm_i32_ty], [llvm_v1i128_ty], [IntrNoMem]>; + // P10 Vector Parallel Bits Deposit/Extract Doubleword Builtins. def int_ppc_altivec_vpdepd : GCCBuiltin<"__builtin_altivec_vpdepd">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index e86e7828c075a8..5bd1632475162c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -965,19 +965,24 @@ let Predicates = [IsISA3_1] in { RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">; def VEXTRACTBM : VXForm_RD5_XO5_RS5<1602, 8, (outs gprc:$rD), (ins vrrc:$vB), "vextractbm $rD, $vB", IIC_VecGeneral, - []>; + [(set i32:$rD, + (int_ppc_altivec_vextractbm v16i8:$vB))]>; def VEXTRACTHM : VXForm_RD5_XO5_RS5<1602, 9, (outs gprc:$rD), (ins vrrc:$vB), "vextracthm $rD, $vB", IIC_VecGeneral, - []>; + [(set i32:$rD, + (int_ppc_altivec_vextracthm v8i16:$vB))]>; def VEXTRACTWM : VXForm_RD5_XO5_RS5<1602, 10, (outs gprc:$rD), (ins vrrc:$vB), "vextractwm $rD, $vB", IIC_VecGeneral, - []>; + [(set i32:$rD, + (int_ppc_altivec_vextractwm v4i32:$vB))]>; def VEXTRACTDM : VXForm_RD5_XO5_RS5<1602, 11, (outs gprc:$rD), (ins vrrc:$vB), "vextractdm $rD, $vB", IIC_VecGeneral, - []>; + [(set i32:$rD, + (int_ppc_altivec_vextractdm v2i64:$vB))]>; def VEXTRACTQM : VXForm_RD5_XO5_RS5<1602, 12, (outs gprc:$rD), (ins vrrc:$vB), "vextractqm $rD, $vB", IIC_VecGeneral, - []>; + [(set i32:$rD, + (int_ppc_altivec_vextractqm v1i128:$vB))]>; def VEXPANDBM : VXForm_RD5_XO5_RS5<1602, 0, (outs vrrc:$vD), (ins vrrc:$vB), "vexpandbm $vD, $vB", IIC_VecGeneral, []>; diff --git a/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll b/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll new file mode 100644 index 00000000000000..2b1cf27c20ec9a --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/p10-vector-mask-ops.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \ +; RUN: FileCheck %s + +; This test case aims to test the vector mask manipulation operations +; on Power10. 
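+;
+; Each of the vextract*m instructions gathers the most-significant bit of
+; every element of the source vector into a GPR mask, one bit per element
+; (e.g. vextractbm forms a 16-bit mask from the 16 byte elements).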
+
+declare i32 @llvm.ppc.altivec.vextractbm(<16 x i8>)
+declare i32 @llvm.ppc.altivec.vextracthm(<8 x i16>)
+declare i32 @llvm.ppc.altivec.vextractwm(<4 x i32>)
+declare i32 @llvm.ppc.altivec.vextractdm(<2 x i64>)
+declare i32 @llvm.ppc.altivec.vextractqm(<1 x i128>)
+
+define i32 @test_vextractbm(<16 x i8> %a) {
+; CHECK-LABEL: test_vextractbm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vextractbm r3, v2
+; CHECK-NEXT:    blr
+entry:
+  %ext = tail call i32 @llvm.ppc.altivec.vextractbm(<16 x i8> %a)
+  ret i32 %ext
+}
+
+define i32 @test_vextracthm(<8 x i16> %a) {
+; CHECK-LABEL: test_vextracthm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vextracthm r3, v2
+; CHECK-NEXT:    blr
+entry:
+  %ext = tail call i32 @llvm.ppc.altivec.vextracthm(<8 x i16> %a)
+  ret i32 %ext
+}
+
+define i32 @test_vextractwm(<4 x i32> %a) {
+; CHECK-LABEL: test_vextractwm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vextractwm r3, v2
+; CHECK-NEXT:    blr
+entry:
+  %ext = tail call i32 @llvm.ppc.altivec.vextractwm(<4 x i32> %a)
+  ret i32 %ext
+}
+
+define i32 @test_vextractdm(<2 x i64> %a) {
+; CHECK-LABEL: test_vextractdm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vextractdm r3, v2
+; CHECK-NEXT:    blr
+entry:
+  %ext = tail call i32 @llvm.ppc.altivec.vextractdm(<2 x i64> %a)
+  ret i32 %ext
+}
+
+define i32 @test_vextractqm(<1 x i128> %a) {
+; CHECK-LABEL: test_vextractqm:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vextractqm r3, v2
+; CHECK-NEXT:    blr
+entry:
+  %ext = tail call i32 @llvm.ppc.altivec.vextractqm(<1 x i128> %a)
+  ret i32 %ext
+}

From a52173a3e56553d7b795bcf3cdadcf6433117107 Mon Sep 17 00:00:00 2001
From: Harmen Stoppels
Date: Mon, 17 Aug 2020 19:51:11 -0700
Subject: [PATCH 004/101] Use find_library for ncurses

Currently it is hard to avoid having LLVM link to the system install of
ncurses, since it uses check_library_exists to find e.g. libtinfo and
not find_library or find_package.

With this change the ncurses lib is found with find_library, which also
considers CMAKE_PREFIX_PATH. This solves an issue for the spack package
manager, where we want to use the ncurses installed by spack, and spack
provides the CMAKE_PREFIX_PATH for it.

This is a similar change to https://reviews.llvm.org/D79219, which just
landed in master.

Differential revision: https://reviews.llvm.org/D85820
---
 compiler-rt/cmake/config-ix.cmake             | 21 +++++++++----------
 compiler-rt/lib/xray/tests/CMakeLists.txt     |  2 +-
 lldb/source/Core/CMakeLists.txt               |  4 ++--
 llvm/cmake/config-ix.cmake                    | 21 ++++++++-----------
 llvm/include/llvm/Config/config.h.cmake       |  2 +-
 llvm/lib/Support/CMakeLists.txt               |  6 ++----
 llvm/lib/Support/Unix/Process.inc             |  6 +++---
 .../llvm/include/llvm/Config/BUILD.gn         |  4 ++--
 8 files changed, 30 insertions(+), 36 deletions(-)

diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 5f9e868de5fd81..c9d0da2fc08936 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -133,17 +133,16 @@ check_library_exists(pthread pthread_create "" COMPILER_RT_HAS_LIBPTHREAD)
 check_library_exists(execinfo backtrace "" COMPILER_RT_HAS_LIBEXECINFO)
 
 # Look for terminfo library, used in unittests that depend on LLVMSupport.
-if(LLVM_ENABLE_TERMINFO) - foreach(library terminfo tinfo curses ncurses ncursesw) - string(TOUPPER ${library} library_suffix) - check_library_exists( - ${library} setupterm "" COMPILER_RT_HAS_TERMINFO_${library_suffix}) - if(COMPILER_RT_HAS_TERMINFO_${library_suffix}) - set(COMPILER_RT_HAS_TERMINFO TRUE) - set(COMPILER_RT_TERMINFO_LIB "${library}") - break() - endif() - endforeach() +if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) + set(MAYBE_REQUIRED REQUIRED) +else() + set(MAYBE_REQUIRED) +endif() +find_library(COMPILER_RT_TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED}) +if(COMPILER_RT_TERMINFO_LIB) + set(LLVM_ENABLE_TERMINFO 1) +else() + set(LLVM_ENABLE_TERMINFO 0) endif() if (ANDROID AND COMPILER_RT_HAS_LIBDL) diff --git a/compiler-rt/lib/xray/tests/CMakeLists.txt b/compiler-rt/lib/xray/tests/CMakeLists.txt index a1fbccaeb6d268..96a9db1ef87773 100644 --- a/compiler-rt/lib/xray/tests/CMakeLists.txt +++ b/compiler-rt/lib/xray/tests/CMakeLists.txt @@ -55,7 +55,7 @@ set(XRAY_UNITTEST_LINK_FLAGS if (NOT APPLE) # Needed by LLVMSupport. append_list_if( - COMPILER_RT_HAS_TERMINFO + LLVM_ENABLE_TERMINFO -l${COMPILER_RT_TERMINFO_LIB} XRAY_UNITTEST_LINK_FLAGS) if (COMPILER_RT_STANDALONE_BUILD) diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index a4057d11077f39..01a25045081f9f 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -11,8 +11,8 @@ set(LLDB_LIBEDIT_LIBS) if (LLDB_ENABLE_CURSES) list(APPEND LLDB_CURSES_LIBS ${CURSES_LIBRARIES} ${PANEL_LIBRARIES}) - if(LLVM_ENABLE_TERMINFO AND HAVE_TERMINFO) - list(APPEND LLDB_CURSES_LIBS ${TERMINFO_LIBS}) + if(LLVM_ENABLE_TERMINFO) + list(APPEND LLDB_CURSES_LIBS ${TERMINFO_LIB}) endif() if (LLVM_BUILD_STATIC) list(APPEND LLDB_CURSES_LIBS gpm) diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 2c20a1afeec01d..67e2eb1cec1433 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -147,19 +147,16 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") else() set(HAVE_LIBEDIT 0) endif() - if(LLVM_ENABLE_TERMINFO) - set(HAVE_TERMINFO 0) - foreach(library terminfo tinfo curses ncurses ncursesw) - string(TOUPPER ${library} library_suffix) - check_library_exists(${library} setupterm "" HAVE_TERMINFO_${library_suffix}) - if(HAVE_TERMINFO_${library_suffix}) - set(HAVE_TERMINFO 1) - set(TERMINFO_LIBS "${library}") - break() - endif() - endforeach() + if(LLVM_ENABLE_TERMINFO STREQUAL FORCE_ON) + set(MAYBE_REQUIRED REQUIRED) + else() + set(MAYBE_REQUIRED) + endif() + find_library(TERMINFO_LIB NAMES terminfo tinfo curses ncurses ncursesw ${MAYBE_REQUIRED}) + if(TERMINFO_LIB) + set(LLVM_ENABLE_TERMINFO 1) else() - set(HAVE_TERMINFO 0) + set(LLVM_ENABLE_TERMINFO 0) endif() find_library(ICONV_LIBRARY_PATH NAMES iconv libiconv libiconv-2 c) diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake index b8c7e070eb3416..a65947bf24c43c 100644 --- a/llvm/include/llvm/Config/config.h.cmake +++ b/llvm/include/llvm/Config/config.h.cmake @@ -209,7 +209,7 @@ #cmakedefine HAVE_SYS_TYPES_H ${HAVE_SYS_TYPES_H} /* Define if the setupterm() function is supported this platform. */ -#cmakedefine HAVE_TERMINFO ${HAVE_TERMINFO} +#cmakedefine LLVM_ENABLE_TERMINFO ${LLVM_ENABLE_TERMINFO} /* Define if the xar_open() function is supported this platform. 
*/ #cmakedefine HAVE_LIBXAR ${HAVE_LIBXAR} diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index 7b45dc628160e0..b895f02a9df77a 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -21,10 +21,8 @@ elseif( CMAKE_HOST_UNIX ) STRING(REGEX REPLACE "^lib" "" Backtrace_LIBFILE ${Backtrace_LIBFILE}) set(system_libs ${system_libs} ${Backtrace_LIBFILE}) endif() - if(LLVM_ENABLE_TERMINFO) - if(HAVE_TERMINFO) - set(system_libs ${system_libs} ${TERMINFO_LIBS}) - endif() + if( LLVM_ENABLE_TERMINFO ) + set(system_libs ${system_libs} ${TERMINFO_LIB}) endif() if( LLVM_ENABLE_THREADS AND (HAVE_LIBATOMIC OR HAVE_CXX_LIBATOMICS64) ) set(system_libs ${system_libs} atomic) diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index 24f16b51af7be9..7425d084da27af 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -313,7 +313,7 @@ unsigned Process::StandardErrColumns() { return getColumns(); } -#ifdef HAVE_TERMINFO +#ifdef LLVM_ENABLE_TERMINFO // We manually declare these extern functions because finding the correct // headers from various terminfo, curses, or other sources is harder than // writing their specs down. @@ -323,12 +323,12 @@ extern "C" int del_curterm(struct term *termp); extern "C" int tigetnum(char *capname); #endif -#ifdef HAVE_TERMINFO +#ifdef LLVM_ENABLE_TERMINFO static ManagedStatic TermColorMutex; #endif static bool terminalHasColors(int fd) { -#ifdef HAVE_TERMINFO +#ifdef LLVM_ENABLE_TERMINFO // First, acquire a global lock because these C routines are thread hostile. std::lock_guard G(*TermColorMutex); diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index c9e7c45fc118ba..35c6890efd6da9 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -284,9 +284,9 @@ write_cmake_config("config") { } if (llvm_enable_terminfo) { - values += [ "HAVE_TERMINFO=1" ] + values += [ "LLVM_ENABLE_TERMINFO=1" ] } else { - values += [ "HAVE_TERMINFO=" ] + values += [ "LLVM_ENABLE_TERMINFO=" ] } if (llvm_enable_dia_sdk) { From 15673d748acd8f26bdeee18c0aa18f44c775d738 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Sun, 16 Aug 2020 18:22:04 -0400 Subject: [PATCH 005/101] [clangd] Index refs to main-file symbols as well Summary: This will be needed to support call hierarchy Reviewers: kadircet Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D83536 --- clang-tools-extra/clangd/ClangdServer.cpp | 3 +- clang-tools-extra/clangd/ClangdServer.h | 3 + clang-tools-extra/clangd/index/Background.cpp | 2 + clang-tools-extra/clangd/index/Background.h | 3 + clang-tools-extra/clangd/index/FileIndex.cpp | 21 ++++--- clang-tools-extra/clangd/index/FileIndex.h | 5 +- .../clangd/index/SymbolCollector.cpp | 5 +- .../clangd/index/SymbolCollector.h | 2 + clang-tools-extra/clangd/tool/ClangdMain.cpp | 8 +++ .../clangd/unittests/BackgroundIndexTests.cpp | 55 +++++++++++++++++++ .../clangd/unittests/SymbolCollectorTests.cpp | 25 +++++++-- 11 files changed, 112 insertions(+), 20 deletions(-) diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 74ab21a5f7788f..d204e87c143b42 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -173,7 +173,8 @@ 
ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, Callbacks *Callbacks) : ConfigProvider(Opts.ConfigProvider), TFS(TFS), DynamicIdx(Opts.BuildDynamicSymbolIndex - ? new FileIndex(Opts.HeavyweightDynamicSymbolIndex) + ? new FileIndex(Opts.HeavyweightDynamicSymbolIndex, + Opts.CollectMainFileRefs) : nullptr), GetClangTidyOptions(Opts.GetClangTidyOptions), SuggestMissingIncludes(Opts.SuggestMissingIncludes), diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index 1bc7d70eebaddc..7068cd5eb42179 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -111,6 +111,9 @@ class ClangdServer { /// on background threads. The index is stored in the project root. bool BackgroundIndex = false; + /// Store refs to main-file symbols in the index. + bool CollectMainFileRefs = false; + /// If set, use this index to augment code completion results. SymbolIndex *StaticIndex = nullptr; diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp index 18037d694c11ed..2bac6ec39d308b 100644 --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -95,6 +95,7 @@ BackgroundIndex::BackgroundIndex( BackgroundIndexStorage::Factory IndexStorageFactory, Options Opts) : SwapIndex(std::make_unique()), TFS(TFS), CDB(CDB), ContextProvider(std::move(Opts.ContextProvider)), + CollectMainFileRefs(Opts.CollectMainFileRefs), Rebuilder(this, &IndexedSymbols, Opts.ThreadPoolSize), IndexStorageFactory(std::move(IndexStorageFactory)), Queue(std::move(Opts.OnProgress)), @@ -301,6 +302,7 @@ llvm::Error BackgroundIndex::index(tooling::CompileCommand Cmd) { return false; // Skip files that haven't changed, without errors. return true; }; + IndexOpts.CollectMainFileRefs = CollectMainFileRefs; IndexFileIn Index; auto Action = createStaticIndexingAction( diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h index 72fe84466959fe..472603013a53ac 100644 --- a/clang-tools-extra/clangd/index/Background.h +++ b/clang-tools-extra/clangd/index/Background.h @@ -137,6 +137,8 @@ class BackgroundIndex : public SwapIndex { // file. Called with the empty string for other tasks. // (When called, the context from BackgroundIndex construction is active). std::function ContextProvider = nullptr; + // Whether to collect references to main-file-only symbols. + bool CollectMainFileRefs = false; }; /// Creates a new background index and starts its threads. 
@@ -188,6 +190,7 @@ class BackgroundIndex : public SwapIndex { const ThreadsafeFS &TFS; const GlobalCompilationDatabase &CDB; std::function ContextProvider; + bool CollectMainFileRefs; llvm::Error index(tooling::CompileCommand); diff --git a/clang-tools-extra/clangd/index/FileIndex.cpp b/clang-tools-extra/clangd/index/FileIndex.cpp index 5f84545d7c73d1..dafec6742c2ca0 100644 --- a/clang-tools-extra/clangd/index/FileIndex.cpp +++ b/clang-tools-extra/clangd/index/FileIndex.cpp @@ -47,12 +47,13 @@ SlabTuple indexSymbols(ASTContext &AST, std::shared_ptr PP, llvm::ArrayRef DeclsToIndex, const MainFileMacros *MacroRefsToIndex, const CanonicalIncludes &Includes, bool IsIndexMainAST, - llvm::StringRef Version) { + llvm::StringRef Version, bool CollectMainFileRefs) { SymbolCollector::Options CollectorOpts; CollectorOpts.CollectIncludePath = true; CollectorOpts.Includes = &Includes; CollectorOpts.CountReferences = false; CollectorOpts.Origin = SymbolOrigin::Dynamic; + CollectorOpts.CollectMainFileRefs = CollectMainFileRefs; index::IndexingOptions IndexOpts; // We only need declarations, because we don't count references. @@ -205,11 +206,11 @@ FileShardedIndex::getShard(llvm::StringRef Uri) const { return std::move(IF); } -SlabTuple indexMainDecls(ParsedAST &AST) { - return indexSymbols(AST.getASTContext(), AST.getPreprocessorPtr(), - AST.getLocalTopLevelDecls(), &AST.getMacros(), - AST.getCanonicalIncludes(), - /*IsIndexMainAST=*/true, AST.version()); +SlabTuple indexMainDecls(ParsedAST &AST, bool CollectMainFileRefs) { + return indexSymbols( + AST.getASTContext(), AST.getPreprocessorPtr(), + AST.getLocalTopLevelDecls(), &AST.getMacros(), AST.getCanonicalIncludes(), + /*IsIndexMainAST=*/true, AST.version(), CollectMainFileRefs); } SlabTuple indexHeaderSymbols(llvm::StringRef Version, ASTContext &AST, @@ -220,7 +221,8 @@ SlabTuple indexHeaderSymbols(llvm::StringRef Version, ASTContext &AST, AST.getTranslationUnitDecl()->decls().end()); return indexSymbols(AST, std::move(PP), DeclsToIndex, /*MainFileMacros=*/nullptr, Includes, - /*IsIndexMainAST=*/false, Version); + /*IsIndexMainAST=*/false, Version, + /*CollectMainFileRefs=*/false); } void FileSymbols::update(llvm::StringRef Key, @@ -371,8 +373,9 @@ FileSymbols::buildIndex(IndexType Type, DuplicateHandling DuplicateHandle, llvm_unreachable("Unknown clangd::IndexType"); } -FileIndex::FileIndex(bool UseDex) +FileIndex::FileIndex(bool UseDex, bool CollectMainFileRefs) : MergedIndex(&MainFileIndex, &PreambleIndex), UseDex(UseDex), + CollectMainFileRefs(CollectMainFileRefs), PreambleIndex(std::make_unique()), MainFileIndex(std::make_unique()) {} @@ -415,7 +418,7 @@ void FileIndex::updatePreamble(PathRef Path, llvm::StringRef Version, } void FileIndex::updateMain(PathRef Path, ParsedAST &AST) { - auto Contents = indexMainDecls(AST); + auto Contents = indexMainDecls(AST, CollectMainFileRefs); MainFileSymbols.update( Path, std::make_unique(std::move(std::get<0>(Contents))), std::make_unique(std::move(std::get<1>(Contents))), diff --git a/clang-tools-extra/clangd/index/FileIndex.h b/clang-tools-extra/clangd/index/FileIndex.h index e6f8d1ef9e3d71..c7bc855bcb8e56 100644 --- a/clang-tools-extra/clangd/index/FileIndex.h +++ b/clang-tools-extra/clangd/index/FileIndex.h @@ -104,7 +104,7 @@ class FileSymbols { /// FIXME: Expose an interface to remove files that are closed. 
 class FileIndex : public MergedIndex {
 public:
-  FileIndex(bool UseDex = true);
+  FileIndex(bool UseDex = true, bool CollectMainFileRefs = false);
 
   /// Update preamble symbols of file \p Path with all declarations in \p AST
   /// and macros in \p PP.
@@ -118,6 +118,7 @@ class FileIndex : public MergedIndex {
 private:
   bool UseDex; // FIXME: this should be always on.
+  bool CollectMainFileRefs;
 
   // Contains information from each file's preamble only. Symbols and relations
   // are sharded per declaration file to deduplicate multiple symbols and reduce
@@ -152,7 +153,7 @@ using SlabTuple = std::tuple<SymbolSlab, RefSlab, RelationSlab>;
 /// Retrieves symbols and refs of local top level decls in \p AST (i.e.
 /// `AST.getLocalTopLevelDecls()`).
 /// Exposed to assist in unit tests.
-SlabTuple indexMainDecls(ParsedAST &AST);
+SlabTuple indexMainDecls(ParsedAST &AST, bool CollectMainFileRefs = false);
 
 /// Index declarations from \p AST and macros from \p PP that are declared in
 /// included headers.
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index a3ceaa388cf9db..2e1f261ab18aee 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -334,12 +334,13 @@ bool SymbolCollector::handleDeclOccurrence(
   if (IsOnlyRef && !CollectRef)
     return true;
 
-  // Do not store references to main-file symbols.
   // Unlike other fields, e.g. Symbols (which use spelling locations), we use
   // file locations for references (as it aligns the behavior of clangd's
   // AST-based xref).
   // FIXME: we should try to use the file locations for other fields.
-  if (CollectRef && (!IsMainFileOnly || ND->isExternallyVisible()) &&
+  if (CollectRef &&
+      (!IsMainFileOnly || Opts.CollectMainFileRefs ||
+       ND->isExternallyVisible()) &&
       !isa<NamespaceDecl>(ND) &&
      (Opts.RefsInHeaders ||
       SM.getFileID(SM.getFileLoc(Loc)) == SM.getMainFileID()))
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.h b/clang-tools-extra/clangd/index/SymbolCollector.h
index f66a71c2d59b10..9b30aeba95383c 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.h
+++ b/clang-tools-extra/clangd/index/SymbolCollector.h
@@ -78,6 +78,8 @@ class SymbolCollector : public index::IndexDataConsumer {
     /// Collect symbols local to main-files, such as static functions
     /// and symbols inside an anonymous namespace.
     bool CollectMainFileSymbols = true;
+    /// Collect references to main-file symbols.
+    bool CollectMainFileRefs = false;
     /// If set to true, SymbolCollector will collect doc for all symbols.
     /// Note that documents of symbols being indexed for completion will always
     /// be collected regardless of this option.
diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
index 3d83f3652f3003..57dac600014d5e 100644
--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
@@ -450,6 +450,13 @@ opt<bool> EnableConfig{
     init(true),
 };
 
+opt<bool> CollectMainFileRefs{
+    "collect-main-file-refs",
+    cat(Misc),
+    desc("Store references to main-file-only symbols in the index"),
+    init(false),
+};
+
 #if CLANGD_ENABLE_REMOTE
 opt<std::string> RemoteIndexAddress{
     "remote-index-address",
@@ -682,6 +689,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var
   if (!ResourceDir.empty())
     Opts.ResourceDir = ResourceDir;
   Opts.BuildDynamicSymbolIndex = EnableIndex;
+  Opts.CollectMainFileRefs = CollectMainFileRefs;
   std::unique_ptr<SymbolIndex> StaticIdx;
   std::future<void> AsyncIndexLoad; // Block exit while loading the index.
   if (EnableIndex && !IndexFile.empty()) {
diff --git a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp
index 06614872363f54..f9f584e8895f52 100644
--- a/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp
+++ b/clang-tools-extra/clangd/unittests/BackgroundIndexTests.cpp
@@ -229,6 +229,61 @@ TEST_F(BackgroundIndexTest, IndexTwoFiles) {
                            FileURI("unittest:///root/B.cc")}));
 }

+TEST_F(BackgroundIndexTest, MainFileRefs) {
+  MockFS FS;
+  FS.Files[testPath("root/A.h")] = R"cpp(
+      void header_sym();
+      )cpp";
+  FS.Files[testPath("root/A.cc")] =
+      "#include \"A.h\"\nstatic void main_sym() { (void)header_sym; }";
+
+  // Check the behaviour with CollectMainFileRefs = false (the default).
+  {
+    llvm::StringMap<std::string> Storage;
+    size_t CacheHits = 0;
+    MemoryShardStorage MSS(Storage, CacheHits);
+    OverlayCDB CDB(/*Base=*/nullptr);
+    BackgroundIndex Idx(FS, CDB, [&](llvm::StringRef) { return &MSS; },
+                        /*Opts=*/{});
+
+    tooling::CompileCommand Cmd;
+    Cmd.Filename = testPath("root/A.cc");
+    Cmd.Directory = testPath("root");
+    Cmd.CommandLine = {"clang++", testPath("root/A.cc")};
+    CDB.setCompileCommand(testPath("root/A.cc"), Cmd);
+
+    ASSERT_TRUE(Idx.blockUntilIdleForTest());
+    EXPECT_THAT(
+        runFuzzyFind(Idx, ""),
+        UnorderedElementsAre(AllOf(Named("header_sym"), NumReferences(1U)),
+                             AllOf(Named("main_sym"), NumReferences(0U))));
+  }
+
+  // Check the behaviour with CollectMainFileRefs = true.
+  {
+    llvm::StringMap<std::string> Storage;
+    size_t CacheHits = 0;
+    MemoryShardStorage MSS(Storage, CacheHits);
+    OverlayCDB CDB(/*Base=*/nullptr);
+    BackgroundIndex::Options Opts;
+    Opts.CollectMainFileRefs = true;
+    BackgroundIndex Idx(
+        FS, CDB, [&](llvm::StringRef) { return &MSS; }, Opts);
+
+    tooling::CompileCommand Cmd;
+    Cmd.Filename = testPath("root/A.cc");
+    Cmd.Directory = testPath("root");
+    Cmd.CommandLine = {"clang++", testPath("root/A.cc")};
+    CDB.setCompileCommand(testPath("root/A.cc"), Cmd);
+
+    ASSERT_TRUE(Idx.blockUntilIdleForTest());
+    EXPECT_THAT(
+        runFuzzyFind(Idx, ""),
+        UnorderedElementsAre(AllOf(Named("header_sym"), NumReferences(1U)),
+                             AllOf(Named("main_sym"), NumReferences(1U))));
+  }
+}
+
 TEST_F(BackgroundIndexTest, ShardStorageTest) {
   MockFS FS;
   FS.Files[testPath("root/A.h")] = R"cpp(
diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
index 70a8e6832d02f0..d89db8f015cea0 100644
--- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp
@@ -714,7 +714,6 @@ TEST_F(SymbolCollectorTest, Refs) {
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(Symbols, "NS").ID, _))));
   EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACRO").ID,
                                   HaveRanges(Main.ranges("macro")))));
-  // Symbols *only* in the main file:
   // - (a, b) externally visible and should have refs.
   // - (c, FUNC) externally invisible and had no refs collected.
   auto MainSymbols =
@@ -723,6 +722,20 @@ TEST_F(SymbolCollectorTest, Refs) {
   EXPECT_THAT(Refs, Contains(Pair(findSymbol(MainSymbols, "b").ID, _)));
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "c").ID, _))));
   EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "FUNC").ID, _))));
+
+  // Run the collector again with CollectMainFileRefs = true.
+  // We need to recreate InMemoryFileSystem because runSymbolCollector()
+  // calls MemoryBuffer::getMemBuffer(), which makes the buffers unusable
+  // after runSymbolCollector() exits.
+  InMemoryFileSystem = new llvm::vfs::InMemoryFileSystem();
+  CollectorOpts.CollectMainFileRefs = true;
+  runSymbolCollector(Header.code(),
+                     (Main.code() + SymbolsOnlyInMainCode.code()).str());
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "a").ID, _)));
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "b").ID, _)));
+  EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "c").ID, _)));
+  // However, references to main-file macros are not collected.
+  EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(Symbols, "FUNC").ID, _))));
 }

 TEST_F(SymbolCollectorTest, MacroRefInHeader) {
@@ -908,8 +921,9 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) {
       $Foo[[Foo]] fo;
     }
   )");
-  // The main file is normal .cpp file, we should collect the refs
-  // for externally visible symbols.
+  // We should collect refs to main-file symbols in all cases:
+
+  // 1. The main file is a normal .cpp file.
   TestFileName = testPath("foo.cpp");
   runSymbolCollector("", Header.code());
   EXPECT_THAT(Refs,
@@ -918,7 +932,7 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) {
                    Pair(findSymbol(Symbols, "Func").ID,
                         HaveRanges(Header.ranges("Func")))));

-  // Run the .h file as main file, we should collect the refs.
+  // 2. Run the .h file as main file.
   TestFileName = testPath("foo.h");
   runSymbolCollector("", Header.code(),
                      /*ExtraArgs=*/{"-xobjective-c++-header"});
@@ -929,8 +943,7 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) {
                    Pair(findSymbol(Symbols, "Func").ID,
                         HaveRanges(Header.ranges("Func")))));

-  // Run the .hh file as main file (without "-x c++-header"), we should collect
-  // the refs as well.
+  // 3. Run the .hh file as main file (without "-x c++-header").
   TestFileName = testPath("foo.hh");
   runSymbolCollector("", Header.code());
   EXPECT_THAT(Symbols, UnorderedElementsAre(QName("Foo"), QName("Func")));

From 00d7b7d014f90aaaacaef6f9c778614b09356bf0 Mon Sep 17 00:00:00 2001
From: Nathan Ridge
Date: Sun, 26 Jul 2020 22:45:24 -0400
Subject: [PATCH 006/101] [clang] Fix visitation of ConceptSpecializationExpr
 in constrained-parameter

Summary: RecursiveASTVisitor needs to traverse
TypeConstraint::ImmediatelyDeclaredConstraint

Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D84136
---
 .../clangd/unittests/FindTargetTests.cpp      | 22 +++++++++
 clang/include/clang/AST/RecursiveASTVisitor.h | 13 +++++-
 clang/unittests/Tooling/CMakeLists.txt        |  1 +
 .../RecursiveASTVisitorTests/Concept.cpp      | 45 +++++++++++++++++++
 4 files changed, 79 insertions(+), 2 deletions(-)
 create mode 100644 clang/unittests/Tooling/RecursiveASTVisitorTests/Concept.cpp

diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index 4c655c3338d203..2507932c5cda30 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -442,6 +442,28 @@ TEST_F(TargetDeclTest, Concept) {
   )cpp";
   EXPECT_DECLS("ConceptSpecializationExpr",
                {"template <typename T> concept Fooable = true;"});
+
+  // constrained-parameter
+  Code = R"cpp(
+    template <typename T>
+    concept Fooable = true;
+
+    template <[[Fooable]] T>
+    void bar(T t);
+  )cpp";
+  EXPECT_DECLS("ConceptSpecializationExpr",
+               {"template <typename T> concept Fooable = true;"});
+
+  // partial-concept-id
+  Code = R"cpp(
+    template <typename T, typename U>
+    concept Fooable = true;
+
+    template <[[Fooable]]<int> T>
+    void bar(T t);
+  )cpp";
+  EXPECT_DECLS("ConceptSpecializationExpr",
+               {"template <typename T, typename U> concept Fooable = true;"});
 }

 TEST_F(TargetDeclTest,
FunctionTemplate) {
diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 3dcfc9fee629ac..6f07b92f253230 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -1777,8 +1777,17 @@ DEF_TRAVERSE_DECL(TemplateTypeParmDecl, {
   // D is the "T" in something like "template <typename T> class vector;"
   if (D->getTypeForDecl())
     TRY_TO(TraverseType(QualType(D->getTypeForDecl(), 0)));
-  if (const auto *TC = D->getTypeConstraint())
-    TRY_TO(TraverseConceptReference(*TC));
+  if (const auto *TC = D->getTypeConstraint()) {
+    if (Expr *IDC = TC->getImmediatelyDeclaredConstraint()) {
+      TRY_TO(TraverseStmt(IDC));
+    } else {
+      // Avoid traversing the ConceptReference in the TypeConstraint
+      // if we have an immediately-declared-constraint, otherwise
+      // we'll end up visiting the concept and the arguments in
+      // the TC twice.
+      TRY_TO(TraverseConceptReference(*TC));
+    }
+  }
   if (D->hasDefaultArgument() && !D->defaultArgumentWasInherited())
     TRY_TO(TraverseTypeLoc(D->getDefaultArgumentInfo()->getTypeLoc()));
 })
diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt
index f290c3d2bedee8..9de330ab73d425 100644
--- a/clang/unittests/Tooling/CMakeLists.txt
+++ b/clang/unittests/Tooling/CMakeLists.txt
@@ -22,6 +22,7 @@ add_clang_unittest(ToolingTests
   RecursiveASTVisitorTests/Attr.cpp
   RecursiveASTVisitorTests/Callbacks.cpp
   RecursiveASTVisitorTests/Class.cpp
+  RecursiveASTVisitorTests/Concept.cpp
   RecursiveASTVisitorTests/ConstructExpr.cpp
   RecursiveASTVisitorTests/CXXBoolLiteralExpr.cpp
   RecursiveASTVisitorTests/CXXMemberCall.cpp
diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/Concept.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/Concept.cpp
new file mode 100644
index 00000000000000..f0f700204dd5a9
--- /dev/null
+++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/Concept.cpp
@@ -0,0 +1,45 @@
+//===- unittest/Tooling/RecursiveASTVisitorTests/Concept.cpp----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestVisitor.h"
+#include "clang/AST/ExprConcepts.h"
+
+using namespace clang;
+
+namespace {
+
+struct ConceptVisitor : ExpectedLocationVisitor<ConceptVisitor> {
+  bool VisitConceptSpecializationExpr(ConceptSpecializationExpr *E) {
+    ++ConceptSpecializationExprsVisited;
+    return true;
+  }
+  bool TraverseConceptReference(const ConceptReference &R) {
+    ++ConceptReferencesTraversed;
+    return true;
+  }
+
+  int ConceptSpecializationExprsVisited = 0;
+  int ConceptReferencesTraversed = 0;
+};
+
+TEST(RecursiveASTVisitor, ConstrainedParameter) {
+  ConceptVisitor Visitor;
+  EXPECT_TRUE(Visitor.runOver("template <typename T> concept Fooable = true;\n"
+                              "template <Fooable T> void bar(T);",
+                              ConceptVisitor::Lang_CXX2a));
+  // Check that we visit the "Fooable T" template parameter's TypeConstraint's
+  // ImmediatelyDeclaredConstraint, which is a ConceptSpecializationExpr.
+  EXPECT_EQ(1, Visitor.ConceptSpecializationExprsVisited);
+  // There are two ConceptReference objects in the AST: the base subobject
+  // of the ConceptSpecializationExpr, and the base subobject of the
+  // TypeConstraint itself. To avoid traversing the concept and arguments
+  // multiple times, we only traverse one.
+  EXPECT_EQ(1, Visitor.ConceptReferencesTraversed);
+}
+
+} // end anonymous namespace

From b27bdf955a74e1050645ef5482498a834e9dfc1e Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Mon, 17 Aug 2020 19:54:42 -0500
Subject: [PATCH 007/101] [Attributor][FIX] Handle function pointers properly
 in AANonNull

Before we tried to create a dominator tree for a declaration when we
wanted to determine if the function pointer is `nonnull`. We now avoid
looking at global values if `Value::getPointerDereferenceableBytes` has
not already determined `nonnull`.
---
 .../Transforms/IPO/AttributorAttributes.cpp   | 39 +++++++---
 .../IPConstantProp/openmp_parallel_for.ll     |  4 +-
 llvm/test/Transforms/Attributor/callbacks.ll  | 20 ++---
 llvm/test/Transforms/Attributor/liveness.ll   | 34 +++++---
 llvm/test/Transforms/Attributor/misc.ll       | 77 ++++++-------------
 llvm/test/Transforms/Attributor/nonnull.ll    | 31 ++++++++
 6 files changed, 119 insertions(+), 86 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 762d4a11151551..13f56ec9d50d87 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1687,21 +1687,33 @@ struct AANonNullImpl : AANonNull {
     Value &V = getAssociatedValue();
     if (!NullIsDefined &&
         hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
-                /* IgnoreSubsumingPositions */ false, &A))
+                /* IgnoreSubsumingPositions */ false, &A)) {
       indicateOptimisticFixpoint();
-    else if (isa<ConstantPointerNull>(V))
+      return;
+    }
+
+    if (isa<ConstantPointerNull>(V)) {
       indicatePessimisticFixpoint();
-    else
-      AANonNull::initialize(A);
+      return;
+    }
+
+    AANonNull::initialize(A);

     bool CanBeNull = true;
-    if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull))
-      if (!CanBeNull)
+    if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull)) {
+      if (!CanBeNull) {
         indicateOptimisticFixpoint();
+        return;
+      }
+    }

-    if (!getState().isAtFixpoint())
-      if (Instruction *CtxI = getCtxI())
-        followUsesInMBEC(*this, A, getState(), *CtxI);
+    if (isa<GlobalValue>(&getAssociatedValue())) {
+      indicatePessimisticFixpoint();
+      return;
+    }
+
+    if (Instruction *CtxI = getCtxI())
+      followUsesInMBEC(*this, A, getState(), *CtxI);
   }

   /// See followUsesInMBEC
@@ -1778,9 +1790,14 @@ struct AANonNullFloating : public AANonNullImpl {

 /// NonNull attribute for function return value.
 struct AANonNullReturned final
-    : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl> {
+    : AAReturnedFromReturnedValues<AANonNull, AANonNull> {
   AANonNullReturned(const IRPosition &IRP, Attributor &A)
-      : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl>(IRP, A) {}
+      : AAReturnedFromReturnedValues<AANonNull, AANonNull>(IRP, A) {}
+
+  /// See AbstractAttribute::getAsStr().
+  const std::string getAsStr() const override {
+    return getAssumed() ?
"nonnull" : "may-null"; + } /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) } diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll index 83990224bddae0..137193b972ca6c 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll @@ -36,7 +36,7 @@ define dso_local void @foo(i32 %N) { ; IS__TUNIT_OPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 ; IS__TUNIT_OPM-NEXT: store float 3.000000e+00, float* [[P]], align 4 ; IS__TUNIT_OPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4 -; IS__TUNIT_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef) +; IS__TUNIT_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@foo @@ -47,7 +47,7 @@ define dso_local void @foo(i32 %N) { ; IS__TUNIT_NPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 ; IS__TUNIT_NPM-NEXT: store float 3.000000e+00, float* [[P]], align 4 ; IS__TUNIT_NPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4 -; IS__TUNIT_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef) +; IS__TUNIT_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@foo diff --git a/llvm/test/Transforms/Attributor/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll index 951f9830ae59e5..f1dfacea8a96ef 100644 --- a/llvm/test/Transforms/Attributor/callbacks.ll +++ b/llvm/test/Transforms/Attributor/callbacks.ll @@ -25,7 +25,7 @@ define void @t0_caller(i32* %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t0_caller @@ -37,7 +37,7 @@ define void @t0_caller(i32* %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t0_caller @@ -124,7 +124,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t1_caller @@ -136,7 +136,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t1_caller @@ -224,7 +224,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t2_caller @@ -236,7 +236,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t2_caller @@ -324,8 +324,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t3_caller @@ -337,8 +337,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture nonnull bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t3_caller diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll index 2a24a168263c6e..57017c50af521a 100644 --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -1785,17 +1785,29 @@ define internal void @call_via_pointer_with_dead_args_internal_b(i32* %a, i32* % ret void } define void @call_via_pointer_with_dead_args_caller(i32* %a, i32* %b) { -; CHECK-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_caller -; CHECK-SAME: (i32* [[A:%.*]], i32* [[B:%.*]]) -; CHECK-NEXT: [[PTR1:%.*]] = alloca i32, align 128 -; CHECK-NEXT: [[PTR2:%.*]] = alloca i32, align 128 -; CHECK-NEXT: [[PTR3:%.*]] = alloca i32, align 128 -; CHECK-NEXT: [[PTR4:%.*]] = alloca i32, align 128 -; CHECK-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) -; CHECK-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_1) -; CHECK-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) -; CHECK-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_2) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_caller +; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* [[B:%.*]]) +; NOT_CGSCC_NPM-NEXT: [[PTR1:%.*]] = alloca i32, align 128 +; NOT_CGSCC_NPM-NEXT: [[PTR2:%.*]] = alloca i32, align 128 +; NOT_CGSCC_NPM-NEXT: [[PTR3:%.*]] = alloca i32, align 128 +; NOT_CGSCC_NPM-NEXT: [[PTR4:%.*]] = alloca i32, align 128 +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 
dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer_internal_1) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer_internal_2) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_caller +; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* [[B:%.*]]) +; IS__CGSCC____-NEXT: [[PTR1:%.*]] = alloca i32, align 128 +; IS__CGSCC____-NEXT: [[PTR2:%.*]] = alloca i32, align 128 +; IS__CGSCC____-NEXT: [[PTR3:%.*]] = alloca i32, align 128 +; IS__CGSCC____-NEXT: [[PTR4:%.*]] = alloca i32, align 128 +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_1) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_2) +; IS__CGSCC____-NEXT: ret void ; %ptr1 = alloca i32, align 128 %ptr2 = alloca i32, align 128 diff --git a/llvm/test/Transforms/Attributor/misc.ll b/llvm/test/Transforms/Attributor/misc.ll index 46a0449e5be6e8..80a6948ca6dc4c 100644 --- a/llvm/test/Transforms/Attributor/misc.ll +++ b/llvm/test/Transforms/Attributor/misc.ll @@ -9,31 +9,18 @@ define internal void @internal(void (i8*)* %fp) { ; ; -; IS__TUNIT____-LABEL: define {{[^@]+}}@internal -; IS__TUNIT____-SAME: (void (i8*)* nonnull [[FP:%.*]]) -; IS__TUNIT____-NEXT: entry: -; IS__TUNIT____-NEXT: [[A:%.*]] = alloca i32, align 4 -; IS__TUNIT____-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) -; IS__TUNIT____-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) -; IS__TUNIT____-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; IS__TUNIT____-NEXT: call void @callback2(void (i8*)* nonnull bitcast (void (i32*)* @foo to void (i8*)*)) -; IS__TUNIT____-NEXT: call void @callback2(void (i8*)* nonnull [[FP]]) -; IS__TUNIT____-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* -; IS__TUNIT____-NEXT: call void [[FP]](i8* [[TMP1]]) -; IS__TUNIT____-NEXT: ret void -; -; IS__CGSCC____-LABEL: define {{[^@]+}}@internal -; IS__CGSCC____-SAME: (void (i8*)* nonnull [[FP:%.*]]) -; IS__CGSCC____-NEXT: entry: -; IS__CGSCC____-NEXT: [[A:%.*]] = alloca i32, align 4 -; IS__CGSCC____-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 
dereferenceable(4) [[A]]) -; IS__CGSCC____-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) -; IS__CGSCC____-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; IS__CGSCC____-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) -; IS__CGSCC____-NEXT: call void @callback2(void (i8*)* nonnull [[FP]]) -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* -; IS__CGSCC____-NEXT: call void [[FP]](i8* [[TMP1]]) -; IS__CGSCC____-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@internal +; CHECK-SAME: (void (i8*)* nonnull [[FP:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) +; CHECK-NEXT: call void @callback1(void (i32*)* nonnull @foo) +; CHECK-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) +; CHECK-NEXT: call void @callback2(void (i8*)* nonnull [[FP]]) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void [[FP]](i8* [[TMP1]]) +; CHECK-NEXT: ret void ; entry: %a = alloca i32, align 4 @@ -51,33 +38,19 @@ entry: define void @external(void (i8*)* %fp) { ; ; -; IS__TUNIT____-LABEL: define {{[^@]+}}@external -; IS__TUNIT____-SAME: (void (i8*)* [[FP:%.*]]) -; IS__TUNIT____-NEXT: entry: -; IS__TUNIT____-NEXT: [[A:%.*]] = alloca i32, align 4 -; IS__TUNIT____-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) -; IS__TUNIT____-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; IS__TUNIT____-NEXT: call void @callback2(void (i8*)* nonnull bitcast (void (i32*)* @foo to void (i8*)*)) -; IS__TUNIT____-NEXT: call void @callback2(void (i8*)* [[FP]]) -; IS__TUNIT____-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) -; IS__TUNIT____-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* -; IS__TUNIT____-NEXT: call void [[FP]](i8* [[TMP1]]) -; IS__TUNIT____-NEXT: call void @internal(void (i8*)* nonnull [[FP]]) -; IS__TUNIT____-NEXT: ret void -; -; IS__CGSCC____-LABEL: define {{[^@]+}}@external -; IS__CGSCC____-SAME: (void (i8*)* [[FP:%.*]]) -; IS__CGSCC____-NEXT: entry: -; IS__CGSCC____-NEXT: [[A:%.*]] = alloca i32, align 4 -; IS__CGSCC____-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) -; IS__CGSCC____-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; IS__CGSCC____-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) -; IS__CGSCC____-NEXT: call void @callback2(void (i8*)* [[FP]]) -; IS__CGSCC____-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* -; IS__CGSCC____-NEXT: call void [[FP]](i8* [[TMP1]]) -; IS__CGSCC____-NEXT: call void @internal(void (i8*)* nonnull [[FP]]) -; IS__CGSCC____-NEXT: ret void +; CHECK-LABEL: define {{[^@]+}}@external +; CHECK-SAME: (void (i8*)* [[FP:%.*]]) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void @callback1(void (i32*)* nonnull @foo) +; CHECK-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) +; CHECK-NEXT: call void @callback2(void (i8*)* [[FP]]) +; CHECK-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) +; CHECK-NEXT: 
[[TMP1:%.*]] = bitcast i32* [[A]] to i8* +; CHECK-NEXT: call void [[FP]](i8* [[TMP1]]) +; CHECK-NEXT: call void @internal(void (i8*)* nonnull [[FP]]) +; CHECK-NEXT: ret void ; entry: %a = alloca i32, align 4 diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll index 9290d32453bada..4add5a5c1f5a89 100644 --- a/llvm/test/Transforms/Attributor/nonnull.ll +++ b/llvm/test/Transforms/Attributor/nonnull.ll @@ -1364,5 +1364,36 @@ define void @nonnull_assume_neg(i8* %arg) { declare void @use_i8_ptr(i8* nofree nocapture readnone) nounwind declare void @use_i8_ptr_ret(i8* nofree nocapture readnone) nounwind willreturn +define i8* @nonnull_function_ptr_1() { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@nonnull_function_ptr_1() +; IS__TUNIT____-NEXT: [[BC:%.*]] = bitcast i8* ()* @nonnull_function_ptr_1 to i8* +; IS__TUNIT____-NEXT: ret i8* [[BC]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@nonnull_function_ptr_1() +; IS__CGSCC____-NEXT: [[BC:%.*]] = bitcast i8* ()* @nonnull_function_ptr_1 to i8* +; IS__CGSCC____-NEXT: ret i8* [[BC]] +; + %bc = bitcast i8*()* @nonnull_function_ptr_1 to i8* + ret i8* %bc +} + +declare i8* @function_decl() +define i8* @nonnull_function_ptr_2() { +; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn +; IS__TUNIT____-LABEL: define {{[^@]+}}@nonnull_function_ptr_2() +; IS__TUNIT____-NEXT: [[BC:%.*]] = bitcast i8* ()* @function_decl to i8* +; IS__TUNIT____-NEXT: ret i8* [[BC]] +; +; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn +; IS__CGSCC____-LABEL: define {{[^@]+}}@nonnull_function_ptr_2() +; IS__CGSCC____-NEXT: [[BC:%.*]] = bitcast i8* ()* @function_decl to i8* +; IS__CGSCC____-NEXT: ret i8* [[BC]] +; + %bc = bitcast i8*()* @function_decl to i8* + ret i8* %bc +} + attributes #0 = { null_pointer_is_valid } attributes #1 = { nounwind willreturn} From 858c75f7d19c14002eb81bcbc747bf708f92b1a9 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Sat, 1 Aug 2020 01:49:28 -0500 Subject: [PATCH 008/101] [Attributor][NFC] Directly return proper type to avoid casts --- llvm/include/llvm/Transforms/IPO/Attributor.h | 4 +- .../Transforms/IPO/AttributorAttributes.cpp | 58 ++++++------------- 2 files changed, 20 insertions(+), 42 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 27832d0fcc8467..b6c0a17fc3e7fa 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -2013,7 +2013,7 @@ struct StateWrapper : public BaseType, public StateTy { StateType &getState() override { return *this; } /// See AbstractAttribute::getState(...). - const AbstractState &getState() const override { return *this; } + const StateType &getState() const override { return *this; } }; /// Helper class that provides common functionality to manifest IR attributes. @@ -3302,7 +3302,7 @@ struct AAValueConstantRange /// See AbstractAttribute::getState(...). IntegerRangeState &getState() override { return *this; } - const AbstractState &getState() const override { return *this; } + const IntegerRangeState &getState() const override { return *this; } /// Create an abstract attribute view for the position \p IRP. 
  static AAValueConstantRange &createForPosition(const IRPosition &IRP,
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 13f56ec9d50d87..81fc52cb3f1cb0 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -449,7 +449,7 @@ static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
     const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
     LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
                       << " @ " << RVPos << "\n");
-    const StateType &AAS = static_cast<const StateType &>(AA.getState());
+    const StateType &AAS = AA.getState();
     if (T.hasValue())
       *T &= AAS;
     else
@@ -511,7 +511,7 @@ static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
     const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
     LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
                       << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
-    const StateType &AAS = static_cast<const StateType &>(AA.getState());
+    const StateType &AAS = AA.getState();
     if (T.hasValue())
       *T &= AAS;
     else
@@ -568,8 +568,7 @@ struct AACallSiteReturnedFromReturned : public BaseType {
     IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
     const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        S, static_cast<const StateType &>(AA.getState()));
+    return clampStateAndIndicateChange(S, AA.getState());
   }
 };

@@ -749,9 +748,7 @@ struct AANoUnwindCallSite final : AANoUnwindImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AANoUnwind::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -1401,8 +1398,7 @@ struct AANoSyncCallSite final : AANoSyncImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(), static_cast<const AANoSync::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -1467,8 +1463,7 @@ struct AANoFreeCallSite final : AANoFreeImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(), static_cast<const AANoFree::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -1550,8 +1545,7 @@ struct AANoFreeCallSiteArgument final : AANoFreeFloating {
       return indicatePessimisticFixpoint();
     const IRPosition &ArgPos = IRPosition::argument(*Arg);
     auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
-    return clampStateAndIndicateChange(
-        getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState()));
+    return clampStateAndIndicateChange(getState(), ArgAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -1769,8 +1763,7 @@ struct AANonNullFloating : public AANonNullImpl {
         T.indicatePessimisticFixpoint();
       } else {
         // Use abstract attribute information.
-        const AANonNull::StateType &NS =
-            static_cast<const AANonNull::StateType &>(AA.getState());
+        const AANonNull::StateType &NS = AA.getState();
         T ^= NS;
       }
       return T.isValidState();
@@ -1924,9 +1917,7 @@ struct AANoRecurseCallSite final : AANoRecurseImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AANoRecurse::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -2339,9 +2330,7 @@ struct AAWillReturnCallSite final : AAWillReturnImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AAWillReturn::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -2735,8 +2724,7 @@ struct AANoAliasCallSiteReturned final : AANoAliasImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::returned(*F);
     auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(), static_cast<const AANoAlias::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -2926,8 +2914,7 @@ struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
       return indicatePessimisticFixpoint();
     const IRPosition &ArgPos = IRPosition::argument(*Arg);
     auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
-    return clampStateAndIndicateChange(
-        getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState()));
+    return clampStateAndIndicateChange(getState(), ArgAA.getState());
   }

   /// See AbstractAttribute::manifest(...).
@@ -3547,7 +3534,7 @@ struct AADereferenceableFloating : AADereferenceableImpl {
         DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
         T.GlobalState.indicatePessimisticFixpoint();
       } else {
-        const DerefState &DS = static_cast<const DerefState &>(AA.getState());
+        const DerefState &DS = AA.getState();
         DerefBytes = DS.DerefBytesState.getAssumed();
         T.GlobalState &= DS.GlobalState;
       }
@@ -3829,8 +3816,7 @@ struct AAAlignFloating : AAAlignImpl {
         T.indicatePessimisticFixpoint();
       } else {
         // Use abstract attribute information.
-        const AAAlign::StateType &DS =
-            static_cast<const AAAlign::StateType &>(AA.getState());
+        const AAAlign::StateType &DS = AA.getState();
         T ^= DS;
       }
       return T.isValidState();
@@ -3985,9 +3971,7 @@ struct AANoReturnCallSite final : AANoReturnImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AANoReturn::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -4374,9 +4358,7 @@ struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
       return indicatePessimisticFixpoint();
     const IRPosition &ArgPos = IRPosition::argument(*Arg);
     auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AANoCapture::StateType &>(ArgAA.getState()));
+    return clampStateAndIndicateChange(getState(), ArgAA.getState());
  }

   /// See AbstractAttribute::trackStatistics()
@@ -5880,9 +5862,7 @@ struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
     Argument *Arg = getAssociatedArgument();
     const IRPosition &ArgPos = IRPosition::argument(*Arg);
     auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState()));
+    return clampStateAndIndicateChange(getState(), ArgAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()
@@ -5965,9 +5945,7 @@ struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
     Function *F = getAssociatedFunction();
     const IRPosition &FnPos = IRPosition::function(*F);
     auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
-    return clampStateAndIndicateChange(
-        getState(),
-        static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState()));
+    return clampStateAndIndicateChange(getState(), FnAA.getState());
   }

   /// See AbstractAttribute::trackStatistics()

From 1d99c3d7073888517e3557c1b522ed273bbe188c Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Sun, 2 Aug 2020 00:31:30 -0500
Subject: [PATCH 009/101] [Attributor] We (should) keep the CG updated so we
 can mark it as preserved
---
 llvm/lib/Transforms/IPO/Attributor.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index fb217cb5f14ee9..54ec7174f9fb8a 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2324,7 +2324,9 @@ PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
   InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
   if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
     // FIXME: Think about passes we will preserve and add them here.
-    return PreservedAnalyses::none();
+    PreservedAnalyses PA;
+    PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+    return PA;
   }
   return PreservedAnalyses::all();
 }

From 8abd69aa9e8c21b9958f531b03ad9801ec850154 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Sun, 2 Aug 2020 00:44:08 -0500
Subject: [PATCH 010/101] [Attributor] Bail early if AAMemoryLocation cannot
 derive anything

Before this change we looked through all memory operations in a
function even if the first was an unknown call that could do anything.
This cost a lot of time, but there was little use in doing so. We also
avoid creating AAs for things we would have looked at only if no other
AA would; that is the reason for the test changes.
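As a sketch of the motivating case (hypothetical IR, not taken from the
test suite): once the first visited instruction is a call to an unknown
function, no memory location can be excluded anymore, so inspecting the
remaining instructions cannot refine the state and we stop:

  declare void @unknown()

  define void @example(i32* %p) {
    ; The unknown call may access any location; the "not assumed
    ; accessed" set collapses to just the valid bit here.
    call void @unknown()
    ; Previously this store was still inspected; now the walk bails.
    store i32 0, i32* %p
    ret void
  }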
Running only the attributor-cgscc pass on an IR version of
`llvm-test-suite/MultiSource/Applications/SPASS/clause.c` reduced the
time we spend in `AAMemoryLocation::update` from 4% total to 0.9%
(disclaimer: no accurate measurements).
---
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp     | 4 +++-
 llvm/test/Transforms/Attributor/dereferenceable-1.ll | 1 +
 llvm/test/Transforms/Attributor/heap_to_stack.ll     | 9 +++++----
 llvm/test/Transforms/Attributor/liveness.ll          | 2 +-
 llvm/test/Transforms/Attributor/noreturn_async.ll    | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 81fc52cb3f1cb0..a1bcec889d1bda 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -6628,7 +6628,9 @@ struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
       LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
                         << ": " << getMemoryLocationsAsStr(MLK) << "\n");
       removeAssumedBits(inverseLocation(MLK, false, false));
-      return true;
+      // Stop once only the valid bit is set in the *not assumed location*,
+      // thus once we don't actually exclude any memory locations in the state.
+      return getAssumedNotAccessedLocation() != VALID_STATE;
     };

     if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
index 9a995396e516ef..3f8fb81a2636b6 100644
--- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll
+++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll
@@ -280,6 +280,7 @@ define void @f7_2(i1 %c) {
; CHECK-SAME: (i1 [[C:%.*]])
; CHECK-NEXT: [[PTR:%.*]] = tail call nonnull align 4 dereferenceable(4) i32* @unkown_ptr()
; CHECK-NEXT: [[A:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(4) [[PTR]])
+; CHECK-NEXT: [[ARG_A_0:%.*]] = load i32, i32* [[PTR]], align 4
; CHECK-NEXT: [[B:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(4) [[PTR]])
; CHECK-NEXT: br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
; CHECK: if.true:
diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll
index 54e293e73179f2..28c0166dd0cd62 100644
--- a/llvm/test/Transforms/Attributor/heap_to_stack.ll
+++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll
@@ -450,10 +450,11 @@ define i32 @irreducible_cfg(i32 %0) {
; IS________OPM-NEXT: [[TMP14]] = add nsw i32 [[DOT1]], 1
; IS________OPM-NEXT: br label [[TMP8]]
; IS________OPM: 15:
-; IS________OPM-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP3]] to i8*
-; IS________OPM-NEXT: call void @free(i8* nocapture [[TMP16]])
-; IS________OPM-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP3]], align 4
-; IS________OPM-NEXT: ret i32 [[TMP17]]
+; IS________OPM-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP3]], align 4
+; IS________OPM-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP3]] to i8*
+; IS________OPM-NEXT: call void @free(i8* nocapture [[TMP17]])
+; IS________OPM-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP3]], align 4
+; IS________OPM-NEXT: ret i32 [[TMP18]]
;
; IS________NPM-LABEL: define {{[^@]+}}@irreducible_cfg
; IS________NPM-SAME: (i32 [[TMP0:%.*]])
diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll
index 57017c50af521a..f3bd7ef1460a85 100644
--- a/llvm/test/Transforms/Attributor/liveness.ll
+++ b/llvm/test/Transforms/Attributor/liveness.ll
@@
-1920,7 +1920,7 @@ define i32 @main() { ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[G_0]], 1 ; CHECK-NEXT: br label [[FOR_COND_0]] ; CHECK: for.end.0: -; CHECK-NEXT: [[CALL:%.*]] = call noalias i8* @malloc(i64 8) +; CHECK-NEXT: [[CALL:%.*]] = call i8* @malloc(i64 8) ; CHECK-NEXT: store i8* [[CALL]], i8** bitcast (%struct.a** @e to i8**), align 8 ; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[CALL]] to %struct.a** ; CHECK-NEXT: store %struct.a* null, %struct.a** [[B]], align 8 diff --git a/llvm/test/Transforms/Attributor/noreturn_async.ll b/llvm/test/Transforms/Attributor/noreturn_async.ll index 4c0fc203eb095e..879fb16a13d0bf 100644 --- a/llvm/test/Transforms/Attributor/noreturn_async.ll +++ b/llvm/test/Transforms/Attributor/noreturn_async.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes --check-attributes -; RUN: opt -attributor -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s +; RUN: opt -attributor -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=1 -S < %s | FileCheck %s ; ; This file is the same as noreturn_sync.ll but with a personality which ; indicates that the exception handler *can* catch asynchronous exceptions. As From 24c3dabef4436ec6436fb80e0672577ec52159ba Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Mon, 17 Aug 2020 18:17:38 -0700 Subject: [PATCH 011/101] DebugInfo: Emit class template parameters first, before members This reads more like what you'd expect the DWARF to look like (from the lexical order of C++ - template parameters come before members, etc), and also happens to make it easier to tickle (& thus test) a bug related to type units and Split DWARF I'm about to fix. --- llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 11 +++++------ .../test/DebugInfo/Generic/template-recursive-void.ll | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 3dfd57c82f89a5..11729842ff710a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -891,6 +891,11 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { } } + // Add template parameters to a class, structure or union types. + if (Tag == dwarf::DW_TAG_class_type || + Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) + addTemplateParams(Buffer, CTy->getTemplateParams()); + // Add elements to structure type. DINodeArray Elements = CTy->getElements(); for (const auto *Element : Elements) { @@ -960,12 +965,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (CTy->isObjcClassComplete()) addFlag(Buffer, dwarf::DW_AT_APPLE_objc_complete_type); - // Add template parameters to a class, structure or union types. - // FIXME: The support isn't in the metadata for this yet. - if (Tag == dwarf::DW_TAG_class_type || - Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type) - addTemplateParams(Buffer, CTy->getTemplateParams()); - // Add the type's non-standard calling convention. 
uint8_t CC = 0; if (CTy->isTypePassByValue()) diff --git a/llvm/test/DebugInfo/Generic/template-recursive-void.ll b/llvm/test/DebugInfo/Generic/template-recursive-void.ll index 0b70f218b3567e..4718b7a1591125 100644 --- a/llvm/test/DebugInfo/Generic/template-recursive-void.ll +++ b/llvm/test/DebugInfo/Generic/template-recursive-void.ll @@ -14,7 +14,7 @@ ; CHECK: DW_TAG_template_type_parameter [{{.*}}] ; CHECK-NEXT: DW_AT_name{{.*}}"T" ; CHECK-NOT: DW_AT_type -; CHECK: NULL +; CHECK: {{DW_TAG|NULL}} source_filename = "test/DebugInfo/Generic/template-recursive-void.ll" From be3ef93bf58aa5546c7baadfb21d43b75fbb4e24 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Mon, 17 Aug 2020 21:27:19 -0700 Subject: [PATCH 012/101] PR44685: DebugInfo: Handle address-use-invalid type units referencing non-type units Theory was that we should never reach a non-type unit (eg: type in an anonymous namespace) when we're already in the invalid "encountered an address-use, so stop emitting types for now, until we throw out the whole type tree to restart emitting in non-type unit" state. But that's not the case (prior commit cleaned up one reason this wasn't exposed sooner - but also makes it easier to test/demonstrate this issue) --- llvm/lib/CodeGen/AsmPrinter/AddressPool.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 6 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 1 + llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll | 89 ++++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.h b/llvm/lib/CodeGen/AsmPrinter/AddressPool.h index f92cf72093ca03..f1edc6c330d51e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.h +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.h @@ -48,7 +48,7 @@ class AddressPool { bool hasBeenUsed() const { return HasBeenUsed; } - void resetUsedFlag() { HasBeenUsed = false; } + void resetUsedFlag(bool HasBeenUsed = false) { this->HasBeenUsed = HasBeenUsed; } MCSymbol *getLabel() { return AddressTableBaseSym; } void setLabel(MCSymbol *Sym) { AddressTableBaseSym = Sym; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index f70eed32f0b532..cee72120accb79 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3305,14 +3305,14 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, DwarfDebug::NonTypeUnitContext::NonTypeUnitContext(DwarfDebug *DD) : DD(DD), - TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)) { + TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), AddrPoolUsed(DD->AddrPool.hasBeenUsed()) { DD->TypeUnitsUnderConstruction.clear(); - assert(TypeUnitsUnderConstruction.empty() || !DD->AddrPool.hasBeenUsed()); + DD->AddrPool.resetUsedFlag(); } DwarfDebug::NonTypeUnitContext::~NonTypeUnitContext() { DD->TypeUnitsUnderConstruction = std::move(TypeUnitsUnderConstruction); - DD->AddrPool.resetUsedFlag(); + DD->AddrPool.resetUsedFlag(AddrPoolUsed); } DwarfDebug::NonTypeUnitContext DwarfDebug::enterNonTypeUnitContext() { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 0b943ebe46b669..93e08d1151ff70 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -648,6 +648,7 @@ class DwarfDebug : public DebugHandlerBase { class NonTypeUnitContext { DwarfDebug *DD; decltype(DwarfDebug::TypeUnitsUnderConstruction) 
TypeUnitsUnderConstruction; + bool AddrPoolUsed; friend class DwarfDebug; NonTypeUnitContext(DwarfDebug *DD); public: diff --git a/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll new file mode 100644 index 00000000000000..98943b73aefe64 --- /dev/null +++ b/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll @@ -0,0 +1,89 @@ +; RUN: llc -filetype=obj -O0 -generate-type-units -split-dwarf-file=x.dwo < %s \ +; RUN: | llvm-dwarfdump -debug-info -debug-types - \ +; RUN: | FileCheck --implicit-check-not=Unit --implicit-check-not=contents --implicit-check-not=declaration %s + +; Test that an address-using-with-Split-DWARF type unit that references a +; non-type unit is handled correctly. A NonTypeUnitContext is used to insulate +; the type construction from being discarded when the prior/outer type has to be +; discarded due to finding it used an address & so can't be type united under +; Split DWARF. + +; The intermediate types tu and t2 are here just to test a bit more +; thoroughly/broadly. They also demonstrate one slight limitation/sub-optimality +; since 't2' isn't put in a type unit. + + +; extern int foo; +; namespace { +; struct t1 { +; }; +; } +; template struct t2 { +; t1 v1; +; }; +; struct t3 { +; t2<&foo> v1; +; }; +; t3 v1; + +; CHECK: .debug_info contents: +; CHECK: Compile Unit: + +; CHECK: .debug_info.dwo contents: +; CHECK: Compile Unit: + +; FIXME: In theory "t3" could be in a type unit - but at the moment, because it +; references t2, which needs an address, t3 gets non-type-united. +; But the same doesn't happen if t3 referenced an anonymous namespace type. + +; CHECK: DW_TAG_structure_type +; CHECK: DW_AT_name ("t3") +; CHECK: DW_TAG_member +; CHECK: DW_AT_type {{.*}} "t2<&foo>" +; CHECK: DW_TAG_namespace +; CHECK: [[T1:0x[0-9a-f]*]]: DW_TAG_structure_type +; CHECK: DW_AT_name ("t1") +; CHECK: DW_TAG_structure_type +; CHECK: DW_AT_name ("t2<&foo>") +; CHECK: DW_TAG_member +; CHECK: DW_AT_name ("v1") +; CHECK: DW_AT_type ([[T1]] "t1") + +; CHECK: .debug_types contents: + +; CHECK-NOT: .debug_types.dwo contents: + + +%struct.t3 = type { %struct.t2 } +%struct.t2 = type { %"struct.(anonymous namespace)::t1" } +%"struct.(anonymous namespace)::t1" = type { i8 } + +@v1 = dso_local global %struct.t3 zeroinitializer, align 1, !dbg !0 +@foo = external dso_local global i32, align 4 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!18, !19, !20} +!llvm.ident = !{!21} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "v1", scope: !2, file: !3, line: 16, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project.git be646ae2865371c7a4966797e88f355de5653e04)", isOptimized: false, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: GNU) +!3 = !DIFile(filename: "test.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch") +!4 = !{} +!5 = !{!0} +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t3", file: !3, line: 12, size: 8, flags: DIFlagTypePassByValue, elements: !7, identifier: "_ZTS2t3") +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !6, file: !3, line: 13, baseType: !9, size: 8) +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2<&foo>", file: !3, line: 8, size: 8, flags: DIFlagTypePassByValue, elements: !10, 
templateParams: !14, identifier: "_ZTS2t2IXadL_Z3fooEEE") +!10 = !{!11} +!11 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !9, file: !3, line: 9, baseType: !12, size: 8) +!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", scope: !13, file: !3, line: 4, size: 8, flags: DIFlagTypePassByValue, elements: !4) +!13 = !DINamespace(scope: null) +!14 = !{!15} +!15 = !DITemplateValueParameter(type: !16, value: i32* @foo) +!16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!18 = !{i32 7, !"Dwarf Version", i32 4} +!19 = !{i32 2, !"Debug Info Version", i32 3} +!20 = !{i32 1, !"wchar_size", i32 4} +!21 = !{!"clang version 12.0.0 (git@github.com:llvm/llvm-project.git be646ae2865371c7a4966797e88f355de5653e04)"} From e33ec9d90400a906314ccbd5821dbe05d070108a Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Thu, 12 Mar 2020 19:27:18 -0400 Subject: [PATCH 013/101] [clangd] Target member of dependent base made visible via a using-decl Fixes https://github.com/clangd/clangd/issues/307 Differential Revision: https://reviews.llvm.org/D86047 --- clang-tools-extra/clangd/FindTarget.cpp | 38 ++++++++++------ clang-tools-extra/clangd/XRefs.cpp | 2 +- .../clangd/unittests/FindTargetTests.cpp | 13 ++++++ .../clangd/unittests/XRefsTests.cpp | 44 ++++++++++++------- 4 files changed, 67 insertions(+), 30 deletions(-) diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index f73a6e58497274..9db814368a024d 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -100,7 +100,7 @@ CXXRecordDecl *resolveTypeToRecordDecl(const Type *T) { std::vector getMembersReferencedViaDependentName( const Type *T, llvm::function_ref NameFactory, - bool IsNonstaticMember) { + llvm::function_ref Filter) { if (!T) return {}; if (auto *ET = T->getAs()) { @@ -113,17 +113,22 @@ std::vector getMembersReferencedViaDependentName( return {}; RD = RD->getDefinition(); DeclarationName Name = NameFactory(RD->getASTContext()); - return RD->lookupDependentName(Name, [=](const NamedDecl *D) { - return IsNonstaticMember ? D->isCXXInstanceMember() - : !D->isCXXInstanceMember(); - }); + return RD->lookupDependentName(Name, Filter); } return {}; } -// Given the type T of a dependent expression that appears of the LHS of a "->", -// heuristically find a corresponding pointee type in whose scope we could look -// up the name appearing on the RHS. +const auto NonStaticFilter = [](const NamedDecl *D) { + return D->isCXXInstanceMember(); +}; +const auto StaticFilter = [](const NamedDecl *D) { + return !D->isCXXInstanceMember(); +}; +const auto ValueFilter = [](const NamedDecl *D) { return isa(D); }; + +// Given the type T of a dependent expression that appears of the LHS of a +// "->", heuristically find a corresponding pointee type in whose scope we +// could look up the name appearing on the RHS. 
const Type *getPointeeType(const Type *T) { if (!T) return nullptr; @@ -141,7 +146,7 @@ const Type *getPointeeType(const Type *T) { [](ASTContext &Ctx) { return Ctx.DeclarationNames.getCXXOperatorName(OO_Arrow); }, - /*IsNonStaticMember=*/true); + NonStaticFilter); if (ArrowOps.empty()) return nullptr; @@ -187,13 +192,12 @@ std::vector resolveExprToDecls(const Expr *E) { } return getMembersReferencedViaDependentName( BaseType, [ME](ASTContext &) { return ME->getMember(); }, - /*IsNonstaticMember=*/true); + NonStaticFilter); } if (const auto *RE = dyn_cast(E)) { return getMembersReferencedViaDependentName( RE->getQualifier()->getAsType(), - [RE](ASTContext &) { return RE->getDeclName(); }, - /*IsNonstaticMember=*/false); + [RE](ASTContext &) { return RE->getDeclName(); }, StaticFilter); } if (const auto *CE = dyn_cast(E)) { const auto *CalleeType = resolveExprToType(CE->getCallee()); @@ -291,7 +295,6 @@ const NamedDecl *getTemplatePattern(const NamedDecl *D) { // CXXDependentScopeMemberExpr, but some other constructs remain to be handled: // - DependentTemplateSpecializationType, // - DependentNameType -// - UnresolvedUsingValueDecl // - UnresolvedUsingTypenameDecl struct TargetFinder { using RelSet = DeclRelationSet; @@ -345,6 +348,15 @@ struct TargetFinder { } else if (const auto *NAD = dyn_cast(D)) { add(NAD->getUnderlyingDecl(), Flags | Rel::Underlying); Flags |= Rel::Alias; // continue with the alias + } else if (const UnresolvedUsingValueDecl *UUVD = + dyn_cast(D)) { + for (const NamedDecl *Target : getMembersReferencedViaDependentName( + UUVD->getQualifier()->getAsType(), + [UUVD](ASTContext &) { return UUVD->getNameInfo().getName(); }, + ValueFilter)) { + add(Target, Flags | Rel::Underlying); + } + Flags |= Rel::Alias; // continue with the alias } else if (const UsingShadowDecl *USD = dyn_cast(D)) { // Include the using decl, but don't traverse it. This may end up // including *all* shadows, which we don't want. diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index 9936c67cb6e5b2..031a9c7bf5da31 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -345,7 +345,7 @@ locateASTReferent(SourceLocation CurLoc, const syntax::Token *TouchedIdentifier, // Give the underlying decl if navigation is triggered on a non-renaming // alias. - if (llvm::isa(D)) { + if (llvm::isa(D) || llvm::isa(D)) { // FIXME: address more complicated cases. TargetDecl(... Underlying) gives // all overload candidates, we only want the targeted one if the cursor is // on an using-alias usage, workround it with getDeclAtPosition. 
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 2507932c5cda30..5bfdaaf6c3434c 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -207,6 +207,19 @@ TEST_F(TargetDeclTest, UsingDecl) { )cpp"; EXPECT_DECLS("MemberExpr", {"using X::foo", Rel::Alias}, {"int foo()", Rel::Underlying}); + + Code = R"cpp( + template + struct Base { + void waldo() {} + }; + template + struct Derived : Base { + using Base::[[waldo]]; + }; + )cpp"; + EXPECT_DECLS("UnresolvedUsingValueDecl", {"using Base::waldo", Rel::Alias}, + {"void waldo()", Rel::Underlying}); } TEST_F(TargetDeclTest, ConstructorInitList) { diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 63e8c96daab842..d2337dcbd7b318 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -1087,66 +1087,78 @@ TEST(LocateSymbol, TextualDependent) { TEST(LocateSymbol, Alias) { const char *Tests[] = { - R"cpp( + R"cpp( template struct function {}; template using [[callback]] = function; c^allback foo; )cpp", - // triggered on non-definition of a renaming alias: should not give any - // underlying decls. - R"cpp( + // triggered on non-definition of a renaming alias: should not give any + // underlying decls. + R"cpp( class Foo {}; typedef Foo [[Bar]]; B^ar b; )cpp", - R"cpp( + R"cpp( class Foo {}; using [[Bar]] = Foo; // definition Ba^r b; )cpp", - // triggered on the underlying decl of a renaming alias. - R"cpp( + // triggered on the underlying decl of a renaming alias. + R"cpp( class [[Foo]]; using Bar = Fo^o; )cpp", - // triggered on definition of a non-renaming alias: should give underlying - // decls. - R"cpp( + // triggered on definition of a non-renaming alias: should give underlying + // decls. + R"cpp( namespace ns { class [[Foo]] {}; } using ns::F^oo; )cpp", - R"cpp( + R"cpp( namespace ns { int [[x]](char); int [[x]](double); } using ns::^x; )cpp", - R"cpp( + R"cpp( namespace ns { int [[x]](char); int x(double); } using ns::x; int y = ^x('a'); )cpp", - R"cpp( + R"cpp( namespace ns { class [[Foo]] {}; } using ns::Foo; F^oo f; )cpp", - // other cases that don't matter much. - R"cpp( + // other cases that don't matter much. + R"cpp( class Foo {}; typedef Foo [[Ba^r]]; )cpp", - R"cpp( + R"cpp( class Foo {}; using [[B^ar]] = Foo; )cpp", + + // Member of dependent base + R"cpp( + template + struct Base { + void [[waldo]]() {} + }; + template + struct Derived : Base { + using Base::w^aldo; + }; + )cpp", }; for (const auto* Case : Tests) { From a4b8c2de1d393525f5333d24999031b25d0e8862 Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Tue, 18 Aug 2020 07:11:52 +0000 Subject: [PATCH 014/101] [mlir] VectorToSCF bug in setAllocAtFunctionEntry fixed. The function makes too strong assumption regarding parent FuncOp which gets broken when FuncOp is first lowered to llvm function. In this fix we generalize the assumption to allocation scope and add assertion to produce user friendly message in case our assumption is broken. 
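To illustrate the failure mode with a minimal C++ sketch (the identifiers
below follow the patch; the surrounding-IR scenario is our assumption): once
the enclosing func.func has been rewritten into an llvm.func, looking up a
parent of type FuncOp finds nothing, so taking .front() of its region
crashes, while walking up to the nearest automatic allocation scope still
succeeds, assuming the LLVM dialect function op provides such a scope too:

  // `op` is assumed to sit inside a function already lowered to the LLVM
  // dialect, so there is no FuncOp ancestor anymore.
  FuncOp func = op->getParentOfType<FuncOp>();   // yields a null op
  Operation *scope =
      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
  assert(scope && "Expected op to be inside automatic allocation scope");
  // Allocas belong at the start of the closest allocation scope's region.
  b.setInsertionPointToStart(&scope->getRegion(0).front());
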
Differential Revision: https://reviews.llvm.org/D86086
---
 mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index ea368c9eb14e05..95208ad231c91d 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -224,7 +224,10 @@ static Value setAllocAtFunctionEntry(MemRefType memRefMinorVectorType,
                                      Operation *op) {
   auto &b = ScopedContext::getBuilderRef();
   OpBuilder::InsertionGuard guard(b);
-  b.setInsertionPointToStart(&op->getParentOfType<FuncOp>().front());
+  Operation *scope =
+      op->getParentWithTrait<OpTrait::AutomaticAllocationScope>();
+  assert(scope && "Expected op to be inside automatic allocation scope");
+  b.setInsertionPointToStart(&scope->getRegion(0).front());
   Value res =
       std_alloca(memRefMinorVectorType, ValueRange{}, b.getI64IntegerAttr(128));
   return res;

From 674f2df4fe0b6af901fc7c7e8bd3fb37e1e8516c Mon Sep 17 00:00:00 2001
From: Alex Zinenko
Date: Mon, 17 Aug 2020 20:25:28 +0200
Subject: [PATCH 015/101] [mlir] Fix printing of unranked memrefs in
 non-default memory space

The type printer was ignoring the memory space on unranked memrefs.

Reviewed By: rriddle

Differential Revision: https://reviews.llvm.org/D86096
---
 mlir/lib/IR/AsmPrinter.cpp    | 3 +++
 mlir/test/IR/core-ops.mlir    | 5 +++++
 mlir/test/IR/invalid-ops.mlir | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index c8b4a864fb63a0..61eecb81108504 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -1650,6 +1650,9 @@ void ModulePrinter::printType(Type type) {
       .Case([&](UnrankedMemRefType memrefTy) {
         os << "memref<*x";
         printType(memrefTy.getElementType());
+        // Only print the memory space if it is the non-default one.
+        if (memrefTy.getMemorySpace())
+          os << ", " << memrefTy.getMemorySpace();
         os << '>';
       })
       .Case([&](ComplexType complexTy) {
diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir
index 89bcd75ffa2a84..74470719047791 100644
--- a/mlir/test/IR/core-ops.mlir
+++ b/mlir/test/IR/core-ops.mlir
@@ -703,6 +703,11 @@ func @memref_cast(%arg0: memref<4xf32>, %arg1 : memref<?xf32>, %arg2 : memref<64
   return
 }
 
+// Check that unranked memrefs with non-default memory space roundtrip
+// properly.
+// CHECK-LABEL: @unranked_memref_roundtrip(memref<*xf32, 4>)
+func @unranked_memref_roundtrip(memref<*xf32, 4>)
+
 // CHECK-LABEL: func @memref_view(%arg0
 func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) {
   %0 = alloc() : memref<2048xi8>
diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir
index 6302a8a4acbf92..55739119aa26d6 100644
--- a/mlir/test/IR/invalid-ops.mlir
+++ b/mlir/test/IR/invalid-ops.mlir
@@ -1076,7 +1076,7 @@ func @invalid_prefetch_locality_hint(%i : index) {
 // incompatible memory space
 func @invalid_memref_cast() {
   %0 = alloc() : memref<2x5xf32, 0>
-  // expected-error@+1 {{operand type 'memref<2x5xf32>' and result type 'memref<*xf32>' are cast incompatible}}
+  // expected-error@+1 {{operand type 'memref<2x5xf32>' and result type 'memref<*xf32, 1>' are cast incompatible}}
   %1 = memref_cast %0 : memref<2x5xf32, 0> to memref<*xf32, 1>
   return
 }

From b475eca1ed8b57bc7457c92b837f93db710c38bc Mon Sep 17 00:00:00 2001
From: Georgii Rymar
Date: Mon, 17 Aug 2020 17:58:14 +0300
Subject: [PATCH 016/101] [llvm-readobj/elf] - Merge mips-got-overlapped.test
 into mips-got.test and refine testing.
The `mips-got-overlapped.test` was introduced in D16968 and its intention is
to check that when there is an empty section at the same address as `.got`,
we are able to locate `.got` and dump it.

The issue is that this test does not test llvm-readelf and uses a precompiled
object. This patch starts using YAML instead and merges
mips-got-overlapped.test into mips-got.test.

Differential revision: https://reviews.llvm.org/D86080
---
 .../ELF/Inputs/got-over.exe.elf-mips          | Bin 1648 -> 0 bytes
 .../llvm-readobj/ELF/mips-got-overlapped.test | 45 -----------
 .../test/tools/llvm-readobj/ELF/mips-got.test | 72 ++++++++++++++++++
 3 files changed, 72 insertions(+), 45 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/Inputs/got-over.exe.elf-mips
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/mips-got-overlapped.test

diff --git a/llvm/test/tools/llvm-readobj/ELF/Inputs/got-over.exe.elf-mips b/llvm/test/tools/llvm-readobj/ELF/Inputs/got-over.exe.elf-mips
deleted file mode 100644
index 27644bff3302a4ceaad06cadb9f0b4eb0dc6ec0e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1648
[base85 binary data omitted]

 | FileCheck %s -DFILE=%t.err7.o --check-prefix=NAME-ERR-NOTFOUND --implicit-check-not=warning:
 # NAME-ERR-NOTFOUND: warning: '[[FILE]]': unable to read the name of SHT_PROGBITS section with index 2: a section [index 2] has an invalid sh_name (0xffff) offset which goes past the end of the section name string table
+
+## Check that we correctly show .got section content when there are some other zero-sized
+## sections with the same address as the .got section.
+## In this test the empty .data section has the same address as the .got section.
+
+# RUN: yaml2obj --docnum=4 %s -o %t.err7.o
+# RUN: llvm-readobj -A %t.err7.o 2>&1 | FileCheck %s -DFILE=%t.err7.o --check-prefix=SAME-ADDR-LLVM
+# RUN: llvm-readelf -A %t.err7.o 2>&1 | FileCheck %s -DFILE=%t.err7.o --check-prefix=SAME-ADDR-GNU
+
+# SAME-ADDR-LLVM: Primary GOT {
+# SAME-ADDR-LLVM-NEXT:   Canonical gp value: 0x9112
+# SAME-ADDR-LLVM-NEXT:   Reserved entries [
+# SAME-ADDR-LLVM-NEXT:     Entry {
+# SAME-ADDR-LLVM-NEXT:       Address: 0x1122
+# SAME-ADDR-LLVM-NEXT:       Access: -32752
+# SAME-ADDR-LLVM-NEXT:       Initial: 0x0
+# SAME-ADDR-LLVM-NEXT:       Purpose: Lazy resolver
+# SAME-ADDR-LLVM-NEXT:     }
+# SAME-ADDR-LLVM-NEXT:   ]
+# SAME-ADDR-LLVM-NEXT:   Local entries [
+# SAME-ADDR-LLVM-NEXT:   ]
+# SAME-ADDR-LLVM-NEXT:   Global entries [
+# SAME-ADDR-LLVM-NEXT:     Entry {
+# SAME-ADDR-LLVM-NEXT:       Address: 0x112A
+# SAME-ADDR-LLVM-NEXT:       Access: -32744
+# SAME-ADDR-LLVM-NEXT:       Initial: 0x0
+# SAME-ADDR-LLVM-NEXT:       Value: 0x0
+# SAME-ADDR-LLVM-NEXT:       Type: None (0x0)
+# SAME-ADDR-LLVM-NEXT:       Section: Undefined (0x0)
+# SAME-ADDR-LLVM-NEXT:       Name: foo (1)
+# SAME-ADDR-LLVM-NEXT:     }
+# SAME-ADDR-LLVM-NEXT:   ]
+# SAME-ADDR-LLVM-NEXT:   Number of TLS and multi-GOT entries: 0
+# SAME-ADDR-LLVM-NEXT: }
+
+# SAME-ADDR-GNU: Primary GOT:
+# SAME-ADDR-GNU-NEXT:  Canonical gp value: 0000000000009112
+# SAME-ADDR-GNU-EMPTY:
+# SAME-ADDR-GNU-NEXT:  Reserved entries:
+# SAME-ADDR-GNU-NEXT:    Address     Access  Initial Purpose
+# SAME-ADDR-GNU-NEXT:    0000000000001122 -32752(gp) 0000000000000000 Lazy resolver
+# SAME-ADDR-GNU-EMPTY:
+# SAME-ADDR-GNU-NEXT:  Global entries:
+# SAME-ADDR-GNU-NEXT:    Address     Access  Initial Sym.Val. Type Ndx Name
+# SAME-ADDR-GNU-NEXT:    000000000000112a -32744(gp) 0000000000000000 0000000000000000 NOTYPE UND foo
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_MIPS
+Sections:
+  - Name:    .data
+    Type:    SHT_PROGBITS
+    Address: 0x1122
+    Size:    0
+  - Name:    .got
+    Type:    SHT_PROGBITS
+    Address: 0x1122
+    Size:    16
+  - Name:    .dynamic
+    Type:    SHT_DYNAMIC
+    Entries:
+      - Tag:   DT_MIPS_LOCAL_GOTNO
+        Value: 1
+      - Tag:   DT_MIPS_GOTSYM
+        Value: 1
+      - Tag:   DT_PLTGOT
+        Value: 0x1122
+DynamicSymbols:
+  - Name: foo

From 6786b3e307175a2e26b88c161c4a7ed999ef2185 Mon Sep 17 00:00:00 2001
From: Georgii Rymar
Date: Mon, 17 Aug 2020 16:38:56 +0300
Subject: [PATCH 017/101] [llvm-readobj/elf] - Refine the
 malformed-pt-dynamic.test.

This is split out from D85519, but significantly reworked.

Changes:
1) This test was changed to stop using Python.
2) Use NoHeaders: true instead of `llvm-objcopy --strip-sections`.
3) Test llvm-readelf too (not just llvm-readobj).
4) Simplify the YAML used a bit (e.g. remove PT_LOAD).
5) Test 2 different cases: objects with a section header table and without.

Differential revision: https://reviews.llvm.org/D86073
---
 .../ELF/malformed-pt-dynamic.test             | 88 ++++++++++++-------
 1 file changed, 56 insertions(+), 32 deletions(-)

diff --git a/llvm/test/tools/llvm-readobj/ELF/malformed-pt-dynamic.test b/llvm/test/tools/llvm-readobj/ELF/malformed-pt-dynamic.test
index 3ffdd57486a0e0..d73f55b5fe6a57 100644
--- a/llvm/test/tools/llvm-readobj/ELF/malformed-pt-dynamic.test
+++ b/llvm/test/tools/llvm-readobj/ELF/malformed-pt-dynamic.test
@@ -1,51 +1,75 @@
-# If the offset and/or size fields of the PT_DYNAMIC field become corrupted,
-# we should report a sensible message.
+## If the offset and/or size fields of the PT_DYNAMIC field become corrupted,
+## we should report a sensible message.
 
-# Creating such a malformed file is hard. The easiest way to simulate it is to
-# truncate the file.
Note that the section headers must first be stripped or -# llvm-readobj will fail to parse the file due to the section header table -# offset pointing outside the file. +## Case A: Test case where the size of the PT_DYNAMIC header is too large to fit in the file, +## but the start is within the file. -# RUN: yaml2obj %s -o %t.base -# RUN: llvm-objcopy --strip-sections %t.base %t.stripped +## Case A.1: the section header table is present in the object. Check that we report a warning about the +## broken PT_DYNAMIC header, check we dump the dynamic table. +# RUN: yaml2obj %s -DFILESIZE=0x131 -o %t1 +# RUN: llvm-readobj %t1 --dynamic-table 2>&1 | FileCheck -DFILE=%t1 %s --check-prefixes=WARN1,WARN1-LLVM +# RUN: llvm-readelf %t1 --dynamic-table 2>&1 | FileCheck -DFILE=%t1 %s --check-prefixes=WARN1,WARN1-GNU -# Test case where the size is too large to fit in the file, but the start is -# within the file. -# RUN: cp %t.stripped %t.truncated1 -# RUN: %python -c "with open(r'%t.truncated1', 'r+') as f: f.truncate(0x1001)" -# RUN: llvm-readobj %t.truncated1 --dynamic-table 2>&1 | \ -# RUN: FileCheck -DFILE=%t.truncated1 %s --check-prefix=WARN1 +# WARN1: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1000) + file size (0x131) exceeds the size of the file (0x1130) -# WARN1: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1000) + file size (0x10) exceeds the size of the file (0x1001) +# WARN1-LLVM: DynamicSection [ (1 entries) +# WARN1-LLVM-NEXT: Tag Type Name/Value +# WARN1-LLVM-NEXT: 0x0000000000000000 NULL 0x0 +# WARN1-LLVM-NEXT: ] -# Test case where the offset is too large to be in the file. -# RUN: cp %t.stripped %t.truncated2 -# RUN: %python -c "with open(r'%t.truncated2', 'r+') as f: f.truncate(0xFFF)" -# RUN: llvm-readobj %t.truncated2 --dynamic-table 2>&1 | \ -# RUN: FileCheck -DFILE=%t.truncated2 %s --check-prefix=WARN2 +# WARN1-GNU: Dynamic section at offset 0x1000 contains 1 entries: +# WARN1-GNU-NEXT: Tag Type Name/Value +# WARN1-GNU-NEXT: 0x0000000000000000 (NULL) 0x0 -# WARN2: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1000) + file size (0x10) exceeds the size of the file (0xfff) +## Case A.2: in this case we drop section headers. The dynamic table is not dumped. +# RUN: yaml2obj %s -DFILESIZE=0x119 -DNOHEADERS=true -o %t1.noheaders +# RUN: llvm-readobj %t1.noheaders --dynamic-table 2>&1 | FileCheck -DFILE=%t1.noheaders %s \ +# RUN: --check-prefix=WARN1-NOHEADERS --implicit-check-not="DynamicSection [" +# RUN: llvm-readelf %t1.noheaders --dynamic-table 2>&1 | FileCheck -DFILE=%t1.noheaders %s \ +# RUN: --check-prefix=WARN1-NOHEADERS --implicit-check-not="Dynamic section" + +# WARN1-NOHEADERS: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1000) + file size (0x119) exceeds the size of the file (0x1118) + +## Case B: Test case where the offset of the PT_DYNAMIC header is too large to be in the file. + +## Case B.1: the section header table is present in the object. Check that we report a warning about the +## broken PT_DYNAMIC header, but document that we do not dump the dynamic table, because +## return an error earlier. 
+
+# RUN: yaml2obj %s -DOFFSET=0x1131 -o %t2
+# RUN: not llvm-readobj %t2 --dynamic-table 2>&1 | FileCheck -DFILE=%t2 %s --check-prefix=WARN2
+# RUN: not llvm-readelf %t2 --dynamic-table 2>&1 | FileCheck -DFILE=%t2 %s --check-prefix=WARN2
+
+# WARN2: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1131) + file size (0x10) exceeds the size of the file (0x1130)
+# WARN2: error: '[[FILE]]': Invalid data was encountered while parsing the file
+
+## Case B.2: in this case we drop section headers. The dynamic table is not dumped.
+# RUN: yaml2obj %s -DOFFSET=0x1119 -DNOHEADERS=true -o %t2.noheaders
+# RUN: llvm-readobj %t2.noheaders --dynamic-table 2>&1 | FileCheck -DFILE=%t2.noheaders %s \
+# RUN:   --check-prefix=WARN2-NOHEADERS --implicit-check-not="DynamicSection ["
+# RUN: llvm-readelf %t2.noheaders --dynamic-table 2>&1 | FileCheck -DFILE=%t2.noheaders %s \
+# RUN:   --check-prefix=WARN2-NOHEADERS --implicit-check-not="Dynamic section"
+
+# WARN2-NOHEADERS: warning: '[[FILE]]': PT_DYNAMIC segment offset (0x1119) + file size (0x10) exceeds the size of the file (0x1118)
 
 --- !ELF
 FileHeader:
   Class: ELFCLASS64
   Data: ELFDATA2LSB
   Type: ET_EXEC
-  Machine: EM_X86_64
+  Machine: EM_NONE
 Sections:
-  - Name: .dynamic
-    Type: SHT_DYNAMIC
-    Address: 0x1000
-    AddressAlign: 0x1000
+  - Name: .dynamic
+    Type: SHT_DYNAMIC
+    Address: 0x1000
+    Offset: 0x1000
+    ShOffset: [[OFFSET=]]
     Entries:
       - Tag: DT_NULL
         Value: 0
 ProgramHeaders:
-  - Type: PT_LOAD
-    VAddr: 0x1000
-    Sections:
-      - Section: .dynamic
-  - Type: PT_DYNAMIC
-    VAddr: 0x1000
+  - Type: PT_DYNAMIC
+    FileSize: [[FILESIZE=]]
     Sections:
       - Section: .dynamic
+SectionHeaderTable:
+  NoHeaders: [[NOHEADERS=false]]

From 5e361e2aa4f602a6b71d241bf4bc1013d25c3bef Mon Sep 17 00:00:00 2001
From: Shinji Okumura
Date: Tue, 18 Aug 2020 18:04:47 +0900
Subject: [PATCH 018/101] [Attributor] Deduce noundef attribute

This patch introduces a new abstract attribute `AANoUndef`, which corresponds
to the `noundef` IR attribute, and deduces it.
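As a rough usage sketch (ours, not part of the patch), other abstract
attributes can consume the deduced state through the usual Attributor
lookup; the attribute name, the position helper, and the two getters come
from this patch, while the wrapper function itself is hypothetical:

  // Hypothetical helper: true if `V` cannot currently be assumed free of
  // undef/poison at its value position.
  static bool mayBeUndefOrPoison(Attributor &A,
                                 const AbstractAttribute &QueryingAA,
                                 Value &V) {
    const auto &NoUndefAA =
        A.getAAFor<AANoUndef>(QueryingAA, IRPosition::value(V));
    // isAssumedNoUndef() reflects the optimistic fixpoint state;
    // isKnownNoUndef() only holds once the property has been proven.
    return !NoUndefAA.isAssumedNoUndef();
  }
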
Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D85184
---
 llvm/include/llvm/Transforms/IPO/Attributor.h |  30 +
 llvm/lib/Transforms/IPO/Attributor.cpp        |   9 +
 .../Transforms/IPO/AttributorAttributes.cpp   | 116 ++++
 .../2008-02-01-ReturnAttrs.ll                 |   8 +-
 .../ArgumentPromotion/X86/attributes.ll       |  40 +-
 .../X86/min-legal-vector-width.ll             | 160 ++---
 .../ArgumentPromotion/X86/thiscall.ll         |  24 +-
 .../Attributor/ArgumentPromotion/alignment.ll |  22 +-
 .../Attributor/ArgumentPromotion/attrs.ll     |   8 +-
 .../Attributor/ArgumentPromotion/basictest.ll |  16 +-
 .../Attributor/ArgumentPromotion/byval-2.ll   |   2 +-
 .../Attributor/ArgumentPromotion/byval.ll     |  16 +-
 .../ArgumentPromotion/control-flow2.ll        |   8 +-
 .../Attributor/ArgumentPromotion/inalloca.ll  |  10 +-
 .../live_called_from_dead.ll                  |   8 +-
 .../live_called_from_dead_2.ll                |  22 +-
 .../ArgumentPromotion/naked_functions.ll      |   2 +-
 .../Attributor/ArgumentPromotion/profile.ll   |   6 +-
 .../ArgumentPromotion/reserve-tbaa.ll         |   4 +-
 .../Attributor/ArgumentPromotion/sret.ll      |  14 +-
 .../Attributor/ArgumentPromotion/tail.ll      |   2 +-
 .../Attributor/ArgumentPromotion/variadic.ll  |   2 +-
 .../IPConstantProp/2009-09-24-byval-ptr.ll    |  14 +-
 .../IPConstantProp/multiple_callbacks.ll      |   8 +-
 .../IPConstantProp/openmp_parallel_for.ll     |  20 +-
 .../Attributor/IPConstantProp/pthreads.ll     |  32 +-
 .../IPConstantProp/return-argument.ll         |   8 +-
 .../IPConstantProp/thread_local_acs.ll        |   8 +-
 llvm/test/Transforms/Attributor/align.ll      |  44 +-
 llvm/test/Transforms/Attributor/callbacks.ll  |  56 +-
 llvm/test/Transforms/Attributor/depgraph.ll   |   1 +
 .../Attributor/dereferenceable-1.ll           |  98 ++-
 .../Transforms/Attributor/heap_to_stack.ll    | 610 ++++++++++++------
 .../Transforms/Attributor/internal-noalias.ll |  24 +-
 llvm/test/Transforms/Attributor/liveness.ll   |  28 +-
 .../Transforms/Attributor/memory_locations.ll |  54 +-
 llvm/test/Transforms/Attributor/misc.ll       |  12 +-
 llvm/test/Transforms/Attributor/misc_crash.ll |   8 +-
 llvm/test/Transforms/Attributor/noalias.ll    | 153 +++--
 .../test/Transforms/Attributor/nocapture-1.ll |   4 +-
 .../test/Transforms/Attributor/nocapture-2.ll |  18 +-
 llvm/test/Transforms/Attributor/nonnull.ll    |  16 +-
 .../Transforms/Attributor/noreturn_async.ll   |   2 +-
 .../Transforms/Attributor/noreturn_sync.ll    |   2 +-
 llvm/test/Transforms/Attributor/nosync.ll     |   2 +-
 llvm/test/Transforms/Attributor/noundef.ll    |  22 +
 .../read_write_returned_arguments_scc.ll      |  20 +-
 llvm/test/Transforms/Attributor/readattrs.ll  |   4 +-
 llvm/test/Transforms/Attributor/returned.ll   |  40 +-
 .../Attributor/undefined_behavior.ll          |  56 +-
 .../Transforms/Attributor/value-simplify.ll   |  39 +-
 .../Transforms/OpenMP/parallel_deletion.ll    |  24 +-
 52 files changed, 1214 insertions(+), 742 deletions(-)
 create mode 100644 llvm/test/Transforms/Attributor/noundef.ll

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index b6c0a17fc3e7fa..73e25417452cf8 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -3560,6 +3560,36 @@ struct AAPotentialValues
   static const char ID;
 };
 
+/// An abstract interface for all noundef attributes.
+struct AANoUndef
+    : public IRAttribute<Attribute::NoUndef,
+                         StateWrapper<BooleanState, AbstractAttribute>> {
+  AANoUndef(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+
+  /// Return true if we assume that the underlying value is noundef.
+  bool isAssumedNoUndef() const { return getAssumed(); }
+
+  /// Return true if we know that underlying value is noundef.
+ bool isKnownNoUndef() const { return getKnown(); } + + /// Create an abstract attribute view for the position \p IRP. + static AANoUndef &createForPosition(const IRPosition &IRP, Attributor &A); + + /// See AbstractAttribute::getName() + const std::string getName() const override { return "AANoUndef"; } + + /// See AbstractAttribute::getIdAddr() + const char *getIdAddr() const override { return &ID; } + + /// This function should return true if the type of the \p AA is AANoUndef + static bool classof(const AbstractAttribute *AA) { + return (AA->getIdAddr() == &ID); + } + + /// Unique ID (due to the unique address) + static const char ID; +}; + /// Run options, used by the pass manager. enum AttributorRunOption { NONE = 0, diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 54ec7174f9fb8a..6cd3e059c3a19c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -1948,6 +1948,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every function with pointer return type might be marked // dereferenceable. getOrCreateAAFor(RetPos); + + // Every function with pointer return type might be marked noundef. + getOrCreateAAFor(RetPos); } } @@ -1985,6 +1988,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Every argument with pointer type might be privatizable (or promotable) getOrCreateAAFor(ArgPos); + + // Every argument with pointer type might be marked noundef. + getOrCreateAAFor(ArgPos); } } @@ -2051,6 +2057,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) { // Call site argument attribute "nofree". getOrCreateAAFor(CBArgPos); + + // Call site argument attribute "noundef". + getOrCreateAAFor(CBArgPos); } return true; }; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index a1bcec889d1bda..721b8814542399 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -131,6 +131,7 @@ PIPE_OPERATOR(AAValueConstantRange) PIPE_OPERATOR(AAPrivatizablePtr) PIPE_OPERATOR(AAUndefinedBehavior) PIPE_OPERATOR(AAPotentialValues) +PIPE_OPERATOR(AANoUndef) #undef PIPE_OPERATOR } // namespace llvm @@ -7625,6 +7626,119 @@ struct AAPotentialValuesCallSiteArgument : AAPotentialValuesFloating { } }; +/// ------------------------ NoUndef Attribute --------------------------------- +struct AANoUndefImpl : AANoUndef { + AANoUndefImpl(const IRPosition &IRP, Attributor &A) : AANoUndef(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + Value &V = getAssociatedValue(); + if (isa(V)) + indicatePessimisticFixpoint(); + else if (isa(V)) + indicateOptimisticFixpoint(); + else if (isGuaranteedNotToBeUndefOrPoison(&V)) + indicateOptimisticFixpoint(); + else + AANoUndef::initialize(A); + } + + /// See followUsesInMBEC + bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I, + AANoUndef::StateType &State) { + const Value *UseV = U->get(); + const DominatorTree *DT = nullptr; + if (Function *F = getAnchorScope()) + DT = A.getInfoCache().getAnalysisResultForFunction( + *F); + State.setKnown(isGuaranteedNotToBeUndefOrPoison(UseV, I, DT)); + bool TrackUse = false; + // Track use for instructions which must produce undef or poison bits when + // at least one operand contains such bits. + if (isa(*I) || isa(*I)) + TrackUse = true; + return TrackUse; + } + + /// See AbstractAttribute::getAsStr(). 
+ const std::string getAsStr() const override { + return getAssumed() ? "noundef" : "may-undef-or-poison"; + } +}; + +struct AANoUndefFloating : public AANoUndefImpl { + AANoUndefFloating(const IRPosition &IRP, Attributor &A) + : AANoUndefImpl(IRP, A) {} + + /// See AbstractAttribute::initialize(...). + void initialize(Attributor &A) override { + AANoUndefImpl::initialize(A); + if (!getState().isAtFixpoint()) + if (Instruction *CtxI = getCtxI()) + followUsesInMBEC(*this, A, getState(), *CtxI); + } + + /// See AbstractAttribute::updateImpl(...). + ChangeStatus updateImpl(Attributor &A) override { + auto VisitValueCB = [&](Value &V, const Instruction *CtxI, + AANoUndef::StateType &T, bool Stripped) -> bool { + const auto &AA = A.getAAFor(*this, IRPosition::value(V)); + if (!Stripped && this == &AA) { + T.indicatePessimisticFixpoint(); + } else { + const AANoUndef::StateType &S = + static_cast(AA.getState()); + T ^= S; + } + return T.isValidState(); + }; + + StateType T; + if (!genericValueTraversal( + A, getIRPosition(), *this, T, VisitValueCB, getCtxI())) + return indicatePessimisticFixpoint(); + + return clampStateAndIndicateChange(getState(), T); + } + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noundef) } +}; + +struct AANoUndefReturned final + : AAReturnedFromReturnedValues { + AANoUndefReturned(const IRPosition &IRP, Attributor &A) + : AAReturnedFromReturnedValues(IRP, A) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noundef) } +}; + +struct AANoUndefArgument final + : AAArgumentFromCallSiteArguments { + AANoUndefArgument(const IRPosition &IRP, Attributor &A) + : AAArgumentFromCallSiteArguments(IRP, A) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noundef) } +}; + +struct AANoUndefCallSiteArgument final : AANoUndefFloating { + AANoUndefCallSiteArgument(const IRPosition &IRP, Attributor &A) + : AANoUndefFloating(IRP, A) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noundef) } +}; + +struct AANoUndefCallSiteReturned final + : AACallSiteReturnedFromReturned { + AANoUndefCallSiteReturned(const IRPosition &IRP, Attributor &A) + : AACallSiteReturnedFromReturned(IRP, A) {} + + /// See AbstractAttribute::trackStatistics() + void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) } +}; } // namespace const char AAReturnedValues::ID = 0; @@ -7649,6 +7763,7 @@ const char AAMemoryBehavior::ID = 0; const char AAMemoryLocation::ID = 0; const char AAValueConstantRange::ID = 0; const char AAPotentialValues::ID = 0; +const char AANoUndef::ID = 0; // Macro magic to create the static generator function for attributes that // follow the naming scheme. 
@@ -7759,6 +7874,7 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange) CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues) +CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef) CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify) CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll index 639772d553f6ab..1a95bdb9ce351a 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/2008-02-01-ReturnAttrs.ll @@ -7,7 +7,7 @@ define internal i32 @deref(i32* %x) nounwind { ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@deref -; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) +; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = load i32, i32* [[X]], align 4 ; IS__TUNIT_OPM-NEXT: ret i32 [[TMP2]] @@ -23,7 +23,7 @@ define internal i32 @deref(i32* %x) nounwind { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@deref -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[TMP2:%.*]] = load i32, i32* [[X]], align 4 ; IS__CGSCC____-NEXT: ret i32 [[TMP2]] @@ -40,7 +40,7 @@ define i32 @f(i32 %x) { ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: store i32 [[X]], i32* [[X_ADDR]], align 4 -; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = call i32 @deref(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X_ADDR]]) +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = call i32 @deref(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X_ADDR]]) ; IS__TUNIT_OPM-NEXT: ret i32 [[TMP1]] ; ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn @@ -59,7 +59,7 @@ define i32 @f(i32 %x) { ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 [[X]], i32* [[X_ADDR]], align 4 -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i32 @deref(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X_ADDR]]) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i32 @deref(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X_ADDR]]) ; IS__CGSCC____-NEXT: ret i32 [[TMP1]] ; entry: diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll index db349295a54d4a..c5affd398d0cdf 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/attributes.ll @@ -10,14 +10,14 @@ target triple = "x86_64-unknown-linux-gnu" define internal fastcc void @no_promote_avx2(<4 x i64>* %arg, <4 x i64>* readonly %arg1) #0 { ; 
NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@no_promote_avx2 -; NOT_TUNIT_NPM-SAME: (<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* nocapture nofree nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) +; NOT_TUNIT_NPM-SAME: (<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) ; NOT_TUNIT_NPM-NEXT: bb: ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1]], align 32 ; NOT_TUNIT_NPM-NEXT: store <4 x i64> [[TMP]], <4 x i64>* [[ARG]], align 32 ; NOT_TUNIT_NPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@no_promote_avx2 -; IS__TUNIT_NPM-SAME: (<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* noalias nocapture nofree nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) +; IS__TUNIT_NPM-SAME: (<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) ; IS__TUNIT_NPM-NEXT: bb: ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1]], align 32 ; IS__TUNIT_NPM-NEXT: store <4 x i64> [[TMP]], <4 x i64>* [[ARG]], align 32 @@ -36,8 +36,8 @@ define void @no_promote(<4 x i64>* %arg) #1 { ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__TUNIT_OPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]]) +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__TUNIT_OPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__TUNIT_OPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__TUNIT_OPM-NEXT: ret void @@ -48,8 +48,8 @@ define void @no_promote(<4 x i64>* %arg) #1 { ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__TUNIT_NPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]]) +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__TUNIT_NPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 
dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__TUNIT_NPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__TUNIT_NPM-NEXT: ret void @@ -60,8 +60,8 @@ define void @no_promote(<4 x i64>* %arg) #1 { ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__CGSCC_OPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]]) +; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__CGSCC_OPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__CGSCC_OPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__CGSCC_OPM-NEXT: ret void @@ -72,8 +72,8 @@ define void @no_promote(<4 x i64>* %arg) #1 { ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__CGSCC_NPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]]) +; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__CGSCC_NPM-NEXT: call fastcc void @no_promote_avx2(<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__CGSCC_NPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__CGSCC_NPM-NEXT: ret void @@ -91,14 +91,14 @@ bb: define internal fastcc void @promote_avx2(<4 x i64>* %arg, <4 x i64>* readonly %arg1) #0 { ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@promote_avx2 -; NOT_TUNIT_NPM-SAME: (<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* nocapture nofree nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) +; NOT_TUNIT_NPM-SAME: (<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[ARG1:%.*]]) ; NOT_TUNIT_NPM-NEXT: bb: ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <4 x i64>, <4 x i64>* [[ARG1]], align 32 ; NOT_TUNIT_NPM-NEXT: store <4 x i64> [[TMP]], <4 x i64>* 
[[ARG]], align 32 ; NOT_TUNIT_NPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@promote_avx2 -; IS__TUNIT_NPM-SAME: (<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64> [[TMP0:%.*]]) +; IS__TUNIT_NPM-SAME: (<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[ARG:%.*]], <4 x i64> [[TMP0:%.*]]) ; IS__TUNIT_NPM-NEXT: bb: ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: store <4 x i64> [[TMP0]], <4 x i64>* [[ARG1_PRIV]], align 32 @@ -119,8 +119,8 @@ define void @promote(<4 x i64>* %arg) #0 { ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__TUNIT_OPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]]) +; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__TUNIT_OPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]]) ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__TUNIT_OPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__TUNIT_OPM-NEXT: ret void @@ -131,9 +131,9 @@ define void @promote(<4 x i64>* %arg) #0 { ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) +; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <4 x i64>, <4 x i64>* [[TMP]], align 32 -; IS__TUNIT_NPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64> [[TMP0]]) +; IS__TUNIT_NPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64> [[TMP0]]) ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32 ; IS__TUNIT_NPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2 ; IS__TUNIT_NPM-NEXT: ret void @@ -144,8 +144,8 @@ define void @promote(<4 x i64>* %arg) #0 { ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8* -; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false) -; IS__CGSCC_OPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture 
nofree nonnull readonly align 32 dereferenceable(32) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32
 ; IS__CGSCC_OPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -156,8 +156,8 @@ define void @promote(<4 x i64>* %arg) #0 {
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <4 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <4 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <4 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* noalias nocapture nofree nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree nonnull readonly align 32 dereferenceable(32) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(32) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @promote_avx2(<4 x i64>* noalias nocapture nofree noundef nonnull writeonly align 32 dereferenceable(32) [[TMP2]], <4 x i64>* noalias nocapture nofree noundef nonnull readonly align 32 dereferenceable(32) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 32
 ; IS__CGSCC_NPM-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
index 4274e3c89111c1..6fa2d588382e18 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -12,14 +12,14 @@ target triple = "x86_64-unknown-linux-gnu"
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #0 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -41,8 +41,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -53,9 +53,9 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -66,8 +66,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -78,8 +78,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -99,14 +99,14 @@ bb:
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -128,8 +128,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -140,9 +140,9 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -153,8 +153,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -165,8 +165,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -186,14 +186,14 @@ bb:
 define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -215,8 +215,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -227,9 +227,9 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -240,8 +240,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -252,8 +252,8 @@ define void @avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -273,14 +273,14 @@ bb:
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #0 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -302,8 +302,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -314,9 +314,9 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -327,8 +327,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -339,8 +339,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -360,14 +360,14 @@ bb:
 define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #1 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
@@ -387,8 +387,8 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -399,8 +399,8 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -411,8 +411,8 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -423,8 +423,8 @@ define void @avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -444,14 +444,14 @@ bb:
 define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #2 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
@@ -471,8 +471,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>*
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -483,8 +483,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>*
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -495,8 +495,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>*
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -507,8 +507,8 @@ define void @avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>*
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -528,14 +528,14 @@ bb:
 define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #3 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -557,8 +557,8 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -569,9 +569,9 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -582,8 +582,8 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -594,8 +594,8 @@ define void @avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* %ar
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
@@ -615,14 +615,14 @@ bb:
 define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %arg, <8 x i64>* readonly %arg1) #4 {
 ;
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
+; NOT_TUNIT_NPM-SAME: (<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[ARG1:%.*]])
 ; NOT_TUNIT_NPM-NEXT: bb:
 ; NOT_TUNIT_NPM-NEXT: [[TMP:%.*]] = load <8 x i64>, <8 x i64>* [[ARG1]], align 64
 ; NOT_TUNIT_NPM-NEXT: store <8 x i64> [[TMP]], <8 x i64>* [[ARG]], align 64
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256
-; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
+; IS__TUNIT_NPM-SAME: (<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]])
 ; IS__TUNIT_NPM-NEXT: bb:
 ; IS__TUNIT_NPM-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP0]], <8 x i64>* [[ARG1_PRIV]], align 64
@@ -644,8 +644,8 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__TUNIT_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_OPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -656,9 +656,9 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar
 ; IS__TUNIT_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__TUNIT_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__TUNIT_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 32 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
 ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = load <8 x i64>, <8 x i64>* [[TMP]], align 64
-; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
+; IS__TUNIT_NPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64> [[TMP0]])
 ; IS__TUNIT_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__TUNIT_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__TUNIT_NPM-NEXT: ret void
@@ -669,8 +669,8 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_OPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_OPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_OPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_OPM-NEXT: ret void
@@ -681,8 +681,8 @@ define void @avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* %ar
 ; IS__CGSCC_NPM-NEXT: [[TMP:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP2:%.*]] = alloca <8 x i64>, align 32
 ; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = bitcast <8 x i64>* [[TMP]] to i8*
-; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
-; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree nonnull readonly align 64 dereferenceable(64) [[TMP]])
+; IS__CGSCC_NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* nocapture noundef nonnull writeonly align 64 dereferenceable(64) [[TMP3]], i8 0, i64 32, i1 false)
+; IS__CGSCC_NPM-NEXT: call fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(<8 x i64>* noalias nocapture nofree noundef nonnull writeonly align 64 dereferenceable(64) [[TMP2]], <8 x i64>* noalias nocapture nofree noundef nonnull readonly align 64 dereferenceable(64) [[TMP]])
 ; IS__CGSCC_NPM-NEXT: [[TMP4:%.*]] = load <8 x i64>, <8 x i64>* [[TMP2]], align 64
 ; IS__CGSCC_NPM-NEXT: store <8 x i64> [[TMP4]], <8 x i64>* [[ARG]], align 2
 ; IS__CGSCC_NPM-NEXT: ret void
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
index fa289c15cacd5e..f96cc52e9837af 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/thiscall.ll
@@ -16,23 +16,23 @@ target triple = "i386-pc-windows-msvc19.11.0"
 define internal x86_thiscallcc void @internalfun(%struct.a* %this, <{ %struct.a }>* inalloca) {
 ; IS__TUNIT____-LABEL: define {{[^@]+}}@internalfun
-; IS__TUNIT____-SAME: (%struct.a* noalias nocapture nofree readnone [[THIS:%.*]], <{ [[STRUCT_A:%.*]] }>* inalloca nonnull align 4 dereferenceable(1) [[TMP0:%.*]])
+; IS__TUNIT____-SAME: (%struct.a* noalias nocapture nofree readnone [[THIS:%.*]], <{ [[STRUCT_A:%.*]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[TMP0:%.*]])
 ; IS__TUNIT____-NEXT: entry:
 ; IS__TUNIT____-NEXT: [[A:%.*]] = getelementptr inbounds <{ [[STRUCT_A]] }>, <{ [[STRUCT_A]] }>* [[TMP0]], i32 0, i32 0
 ; IS__TUNIT____-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A]] }>, align 4
 ; IS__TUNIT____-NEXT: [[TMP1:%.*]] = getelementptr inbounds <{ [[STRUCT_A]] }>, <{ [[STRUCT_A]] }>* [[ARGMEM]], i32 0, i32 0
-; IS__TUNIT____-NEXT: [[CALL:%.*]] = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* nonnull align 4 dereferenceable(1) [[TMP1]], %struct.a* nonnull align 4 dereferenceable(1) [[A]])
-; IS__TUNIT____-NEXT: call void @ext(<{ [[STRUCT_A]] }>* inalloca nonnull align 4 dereferenceable(1) [[ARGMEM]])
+; IS__TUNIT____-NEXT: [[CALL:%.*]] = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* noundef nonnull align 4 dereferenceable(1) [[TMP1]], %struct.a* noundef nonnull align 4 dereferenceable(1) [[A]])
+; IS__TUNIT____-NEXT: call void @ext(<{ [[STRUCT_A]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[ARGMEM]])
 ; IS__TUNIT____-NEXT: ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@internalfun
-; IS__CGSCC____-SAME: (%struct.a* nocapture nofree readnone [[THIS:%.*]], <{ [[STRUCT_A:%.*]] }>* inalloca nonnull align 4 dereferenceable(1) [[TMP0:%.*]])
+; IS__CGSCC____-SAME: (%struct.a* nocapture nofree readnone [[THIS:%.*]], <{ [[STRUCT_A:%.*]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[TMP0:%.*]])
 ; IS__CGSCC____-NEXT: entry:
 ; IS__CGSCC____-NEXT: [[A:%.*]] = getelementptr inbounds <{ [[STRUCT_A]] }>, <{ [[STRUCT_A]] }>* [[TMP0]], i32 0, i32 0
 ; IS__CGSCC____-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A]] }>, align 4
 ; IS__CGSCC____-NEXT: [[TMP1:%.*]] = getelementptr inbounds <{ [[STRUCT_A]] }>, <{ [[STRUCT_A]] }>* [[ARGMEM]], i32 0, i32 0
-; IS__CGSCC____-NEXT: [[CALL:%.*]] = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* nonnull align 4 dereferenceable(1) [[TMP1]], %struct.a* nonnull align 4 dereferenceable(1) [[A]])
-; IS__CGSCC____-NEXT: call void @ext(<{ [[STRUCT_A]] }>* inalloca nonnull align 4 dereferenceable(1) [[ARGMEM]])
+; IS__CGSCC____-NEXT: [[CALL:%.*]] = call x86_thiscallcc %struct.a* @copy_ctor(%struct.a* noundef nonnull align 4 dereferenceable(1) [[TMP1]], %struct.a* noundef nonnull align 4 dereferenceable(1) [[A]])
+; IS__CGSCC____-NEXT: call void @ext(<{ [[STRUCT_A]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[ARGMEM]])
 ; IS__CGSCC____-NEXT: ret void
 ;
 entry:
@@ -48,18 +48,18 @@ entry:
 define void @exportedfun(%struct.a* %a) {
 ; IS__TUNIT____-LABEL: define {{[^@]+}}@exportedfun
 ; IS__TUNIT____-SAME: (%struct.a* nocapture nofree readnone [[A:%.*]])
-; IS__TUNIT____-NEXT: [[INALLOCA_SAVE:%.*]] = tail call i8* @llvm.stacksave()
+; IS__TUNIT____-NEXT: [[INALLOCA_SAVE:%.*]] = tail call noundef i8* @llvm.stacksave()
 ; IS__TUNIT____-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A:%.*]] }>, align 4
-; IS__TUNIT____-NEXT: call x86_thiscallcc void @internalfun(%struct.a* noalias nocapture nofree readnone undef, <{ [[STRUCT_A]] }>* inalloca nonnull align 4 dereferenceable(1) [[ARGMEM]])
-; IS__TUNIT____-NEXT: call void @llvm.stackrestore(i8* [[INALLOCA_SAVE]])
+; IS__TUNIT____-NEXT: call x86_thiscallcc void @internalfun(%struct.a* noalias nocapture nofree readnone undef, <{ [[STRUCT_A]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[ARGMEM]])
+; IS__TUNIT____-NEXT: call void @llvm.stackrestore(i8* noundef [[INALLOCA_SAVE]])
 ; IS__TUNIT____-NEXT: ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@exportedfun
 ; IS__CGSCC____-SAME: (%struct.a* nocapture nofree readnone [[A:%.*]])
-; IS__CGSCC____-NEXT: [[INALLOCA_SAVE:%.*]] = tail call i8* @llvm.stacksave()
+; IS__CGSCC____-NEXT: [[INALLOCA_SAVE:%.*]] = tail call noundef i8* @llvm.stacksave()
 ; IS__CGSCC____-NEXT: [[ARGMEM:%.*]] = alloca inalloca <{ [[STRUCT_A:%.*]] }>, align 4
-; IS__CGSCC____-NEXT: call x86_thiscallcc void @internalfun(%struct.a* noalias nocapture nofree readnone [[A]], <{ [[STRUCT_A]] }>* inalloca nonnull align 4 dereferenceable(1) [[ARGMEM]])
-; IS__CGSCC____-NEXT: call void @llvm.stackrestore(i8* [[INALLOCA_SAVE]])
+; IS__CGSCC____-NEXT: call x86_thiscallcc void @internalfun(%struct.a* noalias nocapture nofree readnone [[A]], <{ [[STRUCT_A]] }>* inalloca noundef nonnull align 4 dereferenceable(1) [[ARGMEM]])
+; IS__CGSCC____-NEXT: call void @llvm.stackrestore(i8* noundef [[INALLOCA_SAVE]])
 ; IS__CGSCC____-NEXT: ret void
 ;
 %inalloca.save = tail call i8* @llvm.stacksave()
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll
index 33cc4975d59608..59c590abe9e932 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll
@@ -8,7 +8,7 @@ define void @f() {
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@f()
 ; NOT_TUNIT_NPM-NEXT: entry:
 ; NOT_TUNIT_NPM-NEXT: [[A:%.*]] = alloca i32, align 1
-; NOT_TUNIT_NPM-NEXT: call void @g(i32* noalias nocapture nonnull readonly dereferenceable(4) [[A]])
+; NOT_TUNIT_NPM-NEXT: call void @g(i32* noalias nocapture noundef nonnull readonly dereferenceable(4) [[A]])
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@f()
@@ -26,7 +26,7 @@ entry:
 define internal void @g(i32* %a) {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@g
-; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nonnull readonly dereferenceable(4) [[A:%.*]])
+; IS__TUNIT_OPM-SAME: (i32* noalias nocapture noundef nonnull readonly dereferenceable(4) [[A:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[AA:%.*]] = load i32, i32* [[A]], align 1
 ; IS__TUNIT_OPM-NEXT: call void @z(i32 [[AA]])
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -40,7 +40,7 @@ define internal void @g(i32* %a) {
 ; IS__TUNIT_NPM-NEXT: ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@g
-; IS__CGSCC____-SAME: (i32* nocapture nonnull readonly dereferenceable(4) [[A:%.*]])
+; IS__CGSCC____-SAME: (i32* nocapture noundef nonnull readonly dereferenceable(4) [[A:%.*]])
 ; IS__CGSCC____-NEXT: [[AA:%.*]] = load i32, i32* [[A]], align 1
 ; IS__CGSCC____-NEXT: call void @z(i32 [[AA]])
 ; IS__CGSCC____-NEXT: ret void
@@ -57,7 +57,7 @@ declare void @z(i32)
 define internal i32 @test(i32* %X, i64* %Y) {
 ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test
-; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i64* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[Y:%.*]])
+; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i64* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[Y:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = load i32, i32* [[X]], align 4
 ; IS__TUNIT_OPM-NEXT: [[B:%.*]] = load i64, i64* [[Y]], align 8
 ; IS__TUNIT_OPM-NEXT: [[C:%.*]] = add i32 [[A]], 1
@@ -89,7 +89,7 @@ define internal i32 @test(i32* %X, i64* %Y) {
 ;
 ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@test
-; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i64* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[Y:%.*]])
+; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i64* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[Y:%.*]])
 ; IS__CGSCC____-NEXT: [[A:%.*]] = load i32, i32* [[X]], align 4
 ; IS__CGSCC____-NEXT: [[B:%.*]] = load i64, i64* [[Y]], align 8
 ; IS__CGSCC____-NEXT: [[C:%.*]] = add i32 [[A]], 1
@@ -116,10 +116,10 @@ Return2:
 define internal i32 @caller(i32* %A) {
 ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@caller
-; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A:%.*]])
+; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[B:%.*]] = alloca i64, align 8
 ; IS__TUNIT_OPM-NEXT: store i64 1, i64* [[B]], align 8
-; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i64* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[B]])
+; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]], i64* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[B]])
 ; IS__TUNIT_OPM-NEXT: ret i32 [[C]]
 ;
 ; IS__TUNIT_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn
@@ -136,10 +136,10 @@ define internal i32 @caller(i32* %A) {
 ;
 ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller
-; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A:%.*]])
+; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]])
 ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i64, align 8
 ; IS__CGSCC____-NEXT: store i64 1, i64* [[B]], align 8
-; IS__CGSCC____-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i64* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[B]])
+; IS__CGSCC____-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]], i64* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[B]])
 ; IS__CGSCC____-NEXT: ret i32 [[C]]
 ;
 %B = alloca i64
@@ -153,7 +153,7 @@ define i32 @callercaller() {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@callercaller()
 ; IS__TUNIT_OPM-NEXT: [[B:%.*]] = alloca i32, align 4
 ; IS__TUNIT_OPM-NEXT: store i32 2, i32* [[B]], align 4
-; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]])
+; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]])
 ; IS__TUNIT_OPM-NEXT: ret i32 [[X]]
 ;
 ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone
@@ -168,7 +168,7 @@ define i32 @callercaller() {
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@callercaller()
 ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i32, align 4
 ; IS__CGSCC____-NEXT: store i32 2, i32* [[B]], align 4
-; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]])
+; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]])
 ; IS__CGSCC____-NEXT: ret i32 [[X]]
 ;
 %B = alloca i32
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
index e7e67c71cf509d..e4a33ef7fc232e 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/attrs.ll
@@ -11,7 +11,7 @@ define internal i32 @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
 ;
 ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@f
-; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull byval align 4 dereferenceable(4) [[X:%.*]], i32 [[I:%.*]])
+; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull byval align 4 dereferenceable(4) [[X:%.*]], i32 [[I:%.*]])
 ; IS__TUNIT_OPM-NEXT: entry:
 ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0
 ; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
@@ -44,7 +44,7 @@ define internal i32 @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
 ;
 ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f
-; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull byval align 4 dereferenceable(4) [[X:%.*]])
+; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull byval align 4 dereferenceable(4) [[X:%.*]])
 ; IS__CGSCC_OPM-NEXT: entry:
 ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0
 ; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8
@@ -100,7 +100,7 @@ define i32 @test(i32* %X) {
 ; IS__TUNIT_OPM-NEXT: store i32 1, i32* [[TMP1]], align 8
 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; IS__TUNIT_OPM-NEXT: store i64 2, i64* [[TMP4]], align 4
-; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree nonnull readonly byval align 8 dereferenceable(12) [[S]], i32* nocapture nofree readonly byval align 4 [[X]], i32 zeroext 0)
+; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree noundef nonnull readonly byval align 8 dereferenceable(12) [[S]], i32* nocapture nofree readonly byval align 4 [[X]], i32 zeroext 0)
 ; IS__TUNIT_OPM-NEXT: ret i32 [[C]]
 ;
 ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn
@@ -129,7 +129,7 @@ define i32 @test(i32* %X) {
 ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[TMP1]], align 8
 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1
 ; IS__CGSCC_OPM-NEXT: store i64 2, i64* [[TMP4]], align 4
-; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree nonnull readnone byval align 8 dereferenceable(12) [[S]], i32* noalias nocapture nofree nonnull readnone byval align 4 dereferenceable(4) [[X]])
+; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree noundef nonnull readnone byval align 8 dereferenceable(12) [[S]], i32* noalias nocapture nofree nonnull readnone byval align 4 dereferenceable(4) [[X]])
 ; IS__CGSCC_OPM-NEXT: ret i32 [[C]]
 ;
 ; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/basictest.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/basictest.ll
index 1522dfe907f005..ea60eb5a1d4900 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/basictest.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/basictest.ll
@@ -8,7 +8,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1
 define internal i32 @test(i32* %X, i32* %Y) {
 ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test
-; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[Y:%.*]])
+; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[Y:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = load i32, i32* [[X]], align 4
 ;
IS__TUNIT_OPM-NEXT: [[B:%.*]] = load i32, i32* [[Y]], align 4 ; IS__TUNIT_OPM-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] @@ -28,7 +28,7 @@ define internal i32 @test(i32* %X, i32* %Y) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@test -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[Y:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[X:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[Y:%.*]]) ; IS__CGSCC____-NEXT: [[A:%.*]] = load i32, i32* [[X]], align 4 ; IS__CGSCC____-NEXT: [[B:%.*]] = load i32, i32* [[Y]], align 4 ; IS__CGSCC____-NEXT: [[C:%.*]] = add i32 [[A]], [[B]] @@ -43,10 +43,10 @@ define internal i32 @test(i32* %X, i32* %Y) { define internal i32 @caller(i32* %B) { ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@caller -; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: store i32 1, i32* [[A]], align 4 -; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__TUNIT_OPM-NEXT: ret i32 [[C]] ; ; IS__TUNIT_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn @@ -63,10 +63,10 @@ define internal i32 @caller(i32* %B) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__CGSCC____-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 1, i32* [[A]], align 4 -; IS__CGSCC____-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC____-NEXT: ret i32 [[C]] ; %A = alloca i32 @@ -80,7 +80,7 @@ define i32 @callercaller() { ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@callercaller() ; IS__TUNIT_OPM-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: store i32 2, i32* [[B]], align 4 -; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__TUNIT_OPM-NEXT: ret i32 [[X]] ; ; IS__TUNIT_NPM: Function Attrs: nofree 
nosync nounwind readnone @@ -95,7 +95,7 @@ define i32 @callercaller() { ; IS__CGSCC____-LABEL: define {{[^@]+}}@callercaller() ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 2, i32* [[B]], align 4 -; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC____-NEXT: ret i32 [[X]] ; %B = alloca i32 diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll index 3d410cf51bcc17..484d5bcaed3a48 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval-2.ll @@ -9,7 +9,7 @@ define internal void @f(%struct.ss* byval %b, i32* byval %X) nounwind { ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f -; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull writeonly byval align 4 dereferenceable(4) [[X:%.*]]) +; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(12) [[B:%.*]], i32* noalias nocapture nofree nonnull writeonly byval align 4 dereferenceable(4) [[X:%.*]]) ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0 ; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8 diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll index f87bd4b802eb81..e04f0b02204b8c 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/byval.ll @@ -11,7 +11,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal i32 @f(%struct.ss* byval %b) nounwind { ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@f -; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 8 dereferenceable(12) [[B:%.*]]) +; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(12) [[B:%.*]]) ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0 ; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8 @@ -36,7 +36,7 @@ define internal i32 @f(%struct.ss* byval %b) nounwind { ; ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f -; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 8 dereferenceable(12) [[B:%.*]]) +; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(12) [[B:%.*]]) ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0 ; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 8 @@ -71,7 +71,7 @@ entry: define internal i32 @g(%struct.ss* byval align 32 %b) nounwind { ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn ; 
IS__TUNIT_OPM-LABEL: define {{[^@]+}}@g -; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 32 dereferenceable(12) [[B:%.*]]) +; IS__TUNIT_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 32 dereferenceable(12) [[B:%.*]]) ; IS__TUNIT_OPM-NEXT: entry: ; IS__TUNIT_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0 ; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32 @@ -96,7 +96,7 @@ define internal i32 @g(%struct.ss* byval align 32 %b) nounwind { ; ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@g -; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree nonnull byval align 32 dereferenceable(12) [[B:%.*]]) +; IS__CGSCC_OPM-SAME: (%struct.ss* noalias nocapture nofree noundef nonnull byval align 32 dereferenceable(12) [[B:%.*]]) ; IS__CGSCC_OPM-NEXT: entry: ; IS__CGSCC_OPM-NEXT: [[TMP:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[B]], i32 0, i32 0 ; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP]], align 32 @@ -137,8 +137,8 @@ define i32 @main() nounwind { ; IS__TUNIT_OPM-NEXT: store i32 1, i32* [[TMP1]], align 8 ; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 ; IS__TUNIT_OPM-NEXT: store i64 2, i64* [[TMP4]], align 4 -; IS__TUNIT_OPM-NEXT: [[C0:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree nonnull readonly byval align 8 dereferenceable(12) [[S]]) -; IS__TUNIT_OPM-NEXT: [[C1:%.*]] = call i32 @g(%struct.ss* noalias nocapture nofree nonnull readonly byval align 32 dereferenceable(12) [[S]]) +; IS__TUNIT_OPM-NEXT: [[C0:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree noundef nonnull readonly byval align 8 dereferenceable(12) [[S]]) +; IS__TUNIT_OPM-NEXT: [[C1:%.*]] = call i32 @g(%struct.ss* noalias nocapture nofree noundef nonnull readonly byval align 32 dereferenceable(12) [[S]]) ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] ; IS__TUNIT_OPM-NEXT: ret i32 [[A]] ; @@ -171,8 +171,8 @@ define i32 @main() nounwind { ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[TMP1]], align 32 ; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 ; IS__CGSCC_OPM-NEXT: store i64 2, i64* [[TMP4]], align 4 -; IS__CGSCC_OPM-NEXT: [[C0:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree nonnull readnone byval align 32 dereferenceable(12) [[S]]) -; IS__CGSCC_OPM-NEXT: [[C1:%.*]] = call i32 @g(%struct.ss* noalias nocapture nofree nonnull readnone byval align 32 dereferenceable(12) [[S]]) +; IS__CGSCC_OPM-NEXT: [[C0:%.*]] = call i32 @f(%struct.ss* noalias nocapture nofree noundef nonnull readnone byval align 32 dereferenceable(12) [[S]]) +; IS__CGSCC_OPM-NEXT: [[C1:%.*]] = call i32 @g(%struct.ss* noalias nocapture nofree noundef nonnull readnone byval align 32 dereferenceable(12) [[S]]) ; IS__CGSCC_OPM-NEXT: [[A:%.*]] = add i32 [[C0]], [[C1]] ; IS__CGSCC_OPM-NEXT: ret i32 [[A]] ; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/control-flow2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/control-flow2.ll index ce997ba494931a..bc22fd6c862fa2 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/control-flow2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/control-flow2.ll @@ -9,7 +9,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal i32 @callee(i1 %C, i32* %P) { ; IS__TUNIT_OPM: Function Attrs: 
argmemonly nofree nosync nounwind readonly willreturn ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@callee -; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[P:%.*]]) +; IS__TUNIT_OPM-SAME: (i1 [[C:%.*]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[P:%.*]]) ; IS__TUNIT_OPM-NEXT: br label [[F:%.*]] ; IS__TUNIT_OPM: T: ; IS__TUNIT_OPM-NEXT: unreachable @@ -31,7 +31,7 @@ define internal i32 @callee(i1 %C, i32* %P) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@callee -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[P:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[P:%.*]]) ; IS__CGSCC____-NEXT: br label [[F:%.*]] ; IS__CGSCC____: T: ; IS__CGSCC____-NEXT: unreachable @@ -54,7 +54,7 @@ define i32 @foo() { ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@foo() ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: store i32 17, i32* [[A]], align 4 -; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]]) +; IS__TUNIT_OPM-NEXT: [[X:%.*]] = call i32 @callee(i1 false, i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]]) ; IS__TUNIT_OPM-NEXT: ret i32 [[X]] ; ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn @@ -69,7 +69,7 @@ define i32 @foo() { ; IS__CGSCC____-LABEL: define {{[^@]+}}@foo() ; IS__CGSCC____-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 17, i32* [[A]], align 4 -; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @callee(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]]) +; IS__CGSCC____-NEXT: [[X:%.*]] = call i32 @callee(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A]]) ; IS__CGSCC____-NEXT: ret i32 [[X]] ; %A = alloca i32 ; [#uses=2] diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll index b7ff607c270382..5da4437f3ae24f 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/inalloca.ll @@ -12,7 +12,7 @@ target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:1 define internal i32 @f(%struct.ss* inalloca %s) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind readonly willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@f -; IS__TUNIT____-SAME: (%struct.ss* inalloca noalias nocapture nofree nonnull align 4 dereferenceable(8) [[S:%.*]]) +; IS__TUNIT____-SAME: (%struct.ss* inalloca noalias nocapture nofree noundef nonnull align 4 dereferenceable(8) [[S:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[F0:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[S]], i32 0, i32 0 ; IS__TUNIT____-NEXT: [[F1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 @@ -23,7 +23,7 @@ define internal i32 @f(%struct.ss* inalloca %s) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@f -; IS__CGSCC____-SAME: (%struct.ss* inalloca nocapture nofree nonnull align 4 dereferenceable(8) [[S:%.*]]) +; IS__CGSCC____-SAME: (%struct.ss* inalloca nocapture nofree noundef 
nonnull align 4 dereferenceable(8) [[S:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[F0:%.*]] = getelementptr [[STRUCT_SS:%.*]], %struct.ss* [[S]], i32 0, i32 0 ; IS__CGSCC____-NEXT: [[F1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 @@ -50,7 +50,7 @@ define i32 @main() { ; IS__TUNIT____-NEXT: [[F1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 ; IS__TUNIT____-NEXT: store i32 1, i32* [[F0]], align 4 ; IS__TUNIT____-NEXT: store i32 2, i32* [[F1]], align 4 -; IS__TUNIT____-NEXT: [[R:%.*]] = call i32 @f(%struct.ss* inalloca noalias nocapture nofree nonnull align 4 dereferenceable(8) [[S]]) +; IS__TUNIT____-NEXT: [[R:%.*]] = call i32 @f(%struct.ss* inalloca noalias nocapture nofree noundef nonnull align 4 dereferenceable(8) [[S]]) ; IS__TUNIT____-NEXT: ret i32 [[R]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn @@ -61,7 +61,7 @@ define i32 @main() { ; IS__CGSCC____-NEXT: [[F1:%.*]] = getelementptr [[STRUCT_SS]], %struct.ss* [[S]], i32 0, i32 1 ; IS__CGSCC____-NEXT: store i32 1, i32* [[F0]], align 4 ; IS__CGSCC____-NEXT: store i32 2, i32* [[F1]], align 4 -; IS__CGSCC____-NEXT: [[R:%.*]] = call i32 @f(%struct.ss* inalloca noalias nocapture nofree nonnull align 4 dereferenceable(8) [[S]]) +; IS__CGSCC____-NEXT: [[R:%.*]] = call i32 @f(%struct.ss* inalloca noalias nocapture nofree noundef nonnull align 4 dereferenceable(8) [[S]]) ; IS__CGSCC____-NEXT: ret i32 [[R]] ; entry: @@ -78,7 +78,7 @@ entry: define internal i1 @g(%struct.ss* %a, %struct.ss* inalloca %b) nounwind { ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@g -; IS__CGSCC____-SAME: (%struct.ss* nocapture nofree nonnull readnone align 4 dereferenceable(8) [[A:%.*]], %struct.ss* inalloca nocapture nofree nonnull writeonly align 4 dereferenceable(8) [[B:%.*]]) +; IS__CGSCC____-SAME: (%struct.ss* nocapture nofree noundef nonnull readnone align 4 dereferenceable(8) [[A:%.*]], %struct.ss* inalloca nocapture nofree noundef nonnull writeonly align 4 dereferenceable(8) [[B:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i1 undef ; diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll index bb14e16820c769..4c2886f83aa992 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll @@ -13,7 +13,7 @@ define internal void @dead() { define internal i32 @test(i32* %X, i32* %Y) { ; IS__CGSCC_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test -; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree writeonly align 4 [[X:%.*]]) +; IS__CGSCC_OPM-SAME: (i32* noalias nocapture nofree noundef writeonly align 4 [[X:%.*]]) ; IS__CGSCC_OPM-NEXT: br i1 true, label [[LIVE:%.*]], label [[DEAD:%.*]] ; IS__CGSCC_OPM: live: ; IS__CGSCC_OPM-NEXT: store i32 0, i32* [[X]], align 4 @@ -23,7 +23,7 @@ define internal i32 @test(i32* %X, i32* %Y) { ; ; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test -; IS__CGSCC_NPM-SAME: (i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[X:%.*]]) +; IS__CGSCC_NPM-SAME: (i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) 
[[X:%.*]]) ; IS__CGSCC_NPM-NEXT: br i1 true, label [[LIVE:%.*]], label [[DEAD:%.*]] ; IS__CGSCC_NPM: live: ; IS__CGSCC_NPM-NEXT: store i32 0, i32* [[X]], align 4 @@ -46,14 +46,14 @@ define internal i32 @caller(i32* %B) { ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@caller() ; IS__CGSCC_OPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[A]], align 4 -; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) +; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) ; IS__CGSCC_OPM-NEXT: ret i32 0 ; ; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@caller() ; IS__CGSCC_NPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[A]], align 4 -; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) +; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) ; IS__CGSCC_NPM-NEXT: ret i32 undef ; %A = alloca i32 diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll index fc1e6589499032..d3bc0c4d317796 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead_2.ll @@ -13,7 +13,7 @@ define internal void @dead() { define internal i32 @test(i32* %X, i32* %Y) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__TUNIT____-LABEL: define {{[^@]+}}@test -; IS__TUNIT____-SAME: (i32* noalias nocapture nofree writeonly align 4 [[X:%.*]]) +; IS__TUNIT____-SAME: (i32* noalias nocapture nofree noundef writeonly align 4 [[X:%.*]]) ; IS__TUNIT____-NEXT: br i1 true, label [[LIVE:%.*]], label [[DEAD:%.*]] ; IS__TUNIT____: live: ; IS__TUNIT____-NEXT: store i32 0, i32* [[X]], align 4 @@ -23,7 +23,7 @@ define internal i32 @test(i32* %X, i32* %Y) { ; ; IS__CGSCC_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test -; IS__CGSCC_OPM-SAME: (i32* nocapture nofree writeonly align 4 [[X:%.*]]) +; IS__CGSCC_OPM-SAME: (i32* nocapture nofree noundef writeonly align 4 [[X:%.*]]) ; IS__CGSCC_OPM-NEXT: br i1 true, label [[LIVE:%.*]], label [[DEAD:%.*]] ; IS__CGSCC_OPM: live: ; IS__CGSCC_OPM-NEXT: store i32 0, i32* [[X]], align 4 @@ -33,7 +33,7 @@ define internal i32 @test(i32* %X, i32* %Y) { ; ; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test -; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[X:%.*]]) +; IS__CGSCC_NPM-SAME: (i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[X:%.*]]) ; IS__CGSCC_NPM-NEXT: br i1 true, label [[LIVE:%.*]], label [[DEAD:%.*]] ; IS__CGSCC_NPM: live: ; IS__CGSCC_NPM-NEXT: store i32 0, i32* [[X]], align 4 @@ -54,26 +54,26 @@ dead: define internal i32 @caller(i32* %B) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller -; IS__TUNIT____-SAME: (i32* noalias nocapture nofree nonnull writeonly align 4 
dereferenceable(4) [[B:%.*]]) +; IS__TUNIT____-SAME: (i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__TUNIT____-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__TUNIT____-NEXT: store i32 1, i32* [[A]], align 4 -; IS__TUNIT____-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT____-NEXT: [[C:%.*]] = call i32 @test(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) ; IS__TUNIT____-NEXT: ret i32 0 ; ; IS__CGSCC_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@caller -; IS__CGSCC_OPM-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__CGSCC_OPM-SAME: (i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__CGSCC_OPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC_OPM-NEXT: store i32 1, i32* [[A]], align 4 -; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC_OPM-NEXT: ret i32 0 ; ; IS__CGSCC_NPM: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@caller -; IS__CGSCC_NPM-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__CGSCC_NPM-SAME: (i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__CGSCC_NPM-NEXT: [[A:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: store i32 1, i32* [[A]], align 4 -; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call i32 @test(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC_NPM-NEXT: ret i32 undef ; %A = alloca i32 @@ -87,14 +87,14 @@ define i32 @callercaller() { ; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@callercaller() ; NOT_CGSCC_NPM-NEXT: [[B:%.*]] = alloca i32, align 4 ; NOT_CGSCC_NPM-NEXT: store i32 2, i32* [[B]], align 4 -; NOT_CGSCC_NPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B]]) +; NOT_CGSCC_NPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) ; NOT_CGSCC_NPM-NEXT: ret i32 0 ; ; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callercaller() ; IS__CGSCC_NPM-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__CGSCC_NPM-NEXT: store i32 2, i32* [[B]], align 4 -; IS__CGSCC_NPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC_NPM-NEXT: [[X:%.*]] = call i32 @caller(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC_NPM-NEXT: ret i32 0 ; %B = alloca i32 diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll index 0d3464c062fa20..dcd4feba716a00 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/naked_functions.ll @@ -11,7 +11,7 @@ 
 define i32 @bar() {
 ; CHECK-LABEL: define {{[^@]+}}@bar()
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32* nonnull align 4 dereferenceable(4) @g)
+; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo(i32* noundef nonnull align 4 dereferenceable(4) @g)
 ; CHECK-NEXT: ret i32 [[CALL]]
 ;
 entry:
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
index 1c847b88b52860..94be92dc73695f 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/profile.ll
@@ -11,7 +11,7 @@ define void @caller() #0 {
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@caller()
 ; NOT_TUNIT_NPM-NEXT: [[X:%.*]] = alloca i32, align 4
 ; NOT_TUNIT_NPM-NEXT: store i32 42, i32* [[X]], align 4
-; NOT_TUNIT_NPM-NEXT: call void @promote_i32_ptr(i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[X]]), !prof !0
+; NOT_TUNIT_NPM-NEXT: call void @promote_i32_ptr(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[X]]), !prof !0
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@caller()
@@ -29,7 +29,7 @@ define void @caller() #0 {

 define internal void @promote_i32_ptr(i32* %xp) {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@promote_i32_ptr
-; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[XP:%.*]])
+; IS__TUNIT_OPM-SAME: (i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[XP:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[X:%.*]] = load i32, i32* [[XP]], align 4
 ; IS__TUNIT_OPM-NEXT: call void @use_i32(i32 [[X]])
 ; IS__TUNIT_OPM-NEXT: ret void
@@ -43,7 +43,7 @@ define internal void @promote_i32_ptr(i32* %xp) {
 ; IS__TUNIT_NPM-NEXT: ret void
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@promote_i32_ptr
-; IS__CGSCC____-SAME: (i32* nocapture nonnull readonly align 4 dereferenceable(4) [[XP:%.*]])
+; IS__CGSCC____-SAME: (i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[XP:%.*]])
 ; IS__CGSCC____-NEXT: [[X:%.*]] = load i32, i32* [[XP]], align 4
 ; IS__CGSCC____-NEXT: call void @use_i32(i32 [[X]])
 ; IS__CGSCC____-NEXT: ret void
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
index 38a159608827b5..2f7e41f080cd6f 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/reserve-tbaa.ll
@@ -19,7 +19,7 @@
 define internal fastcc void @fn(i32* nocapture readonly %p1, i64* nocapture readonly %p2) {
 ; IS__TUNIT____: Function Attrs: nofree nosync nounwind willreturn
 ; IS__TUNIT____-LABEL: define {{[^@]+}}@fn
-; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[P1:%.*]])
+; IS__TUNIT____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[P1:%.*]])
 ; IS__TUNIT____-NEXT: entry:
 ; IS__TUNIT____-NEXT: [[TMP0:%.*]] = load i32, i32* @g, align 4, [[TBAA0:!tbaa !.*]]
 ; IS__TUNIT____-NEXT: [[CONV1:%.*]] = trunc i32 [[TMP0]] to i8
@@ -51,7 +51,7 @@ define i32 @main() {
 ; IS__TUNIT____-NEXT: store i32* @g, i32** [[TMP0]], align 8, [[TBAA5]]
 ; IS__TUNIT____-NEXT: [[TMP1:%.*]] = load i32*, i32** @a, align 8, [[TBAA5]]
 ; IS__TUNIT____-NEXT: store i32 1, i32* [[TMP1]], align 4, [[TBAA0]]
-; IS__TUNIT____-NEXT: call fastcc void @fn(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) @g)
+; IS__TUNIT____-NEXT: call fastcc void @fn(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @g)
 ; IS__TUNIT____-NEXT: ret i32 0
 ;
 ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind willreturn
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/sret.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/sret.ll
index a16d6fc49ee3dc..834df9a1c85447 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/sret.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/sret.ll
@@ -11,7 +11,7 @@ define internal void @add({i32, i32}* %this, i32* sret %r) {
 ;
 ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree nosync nounwind willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@add
-; IS__TUNIT_OPM-SAME: ({ i32, i32 }* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
+; IS__TUNIT_OPM-SAME: ({ i32, i32 }* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
 ; IS__TUNIT_OPM-NEXT: [[AP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 0
 ; IS__TUNIT_OPM-NEXT: [[BP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 1
 ; IS__TUNIT_OPM-NEXT: [[A:%.*]] = load i32, i32* [[AP]], align 8
@@ -22,7 +22,7 @@ define internal void @add({i32, i32}* %this, i32* sret %r) {
 ;
 ; IS__TUNIT_NPM: Function Attrs: argmemonly nofree nosync nounwind willreturn
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@add
-; IS__TUNIT_NPM-SAME: ({ i32, i32 }* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* noalias nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
+; IS__TUNIT_NPM-SAME: ({ i32, i32 }* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* noalias nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
 ; IS__TUNIT_NPM-NEXT: [[AP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 0
 ; IS__TUNIT_NPM-NEXT: [[BP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 1
 ; IS__TUNIT_NPM-NEXT: [[A:%.*]] = load i32, i32* [[AP]], align 8
@@ -33,7 +33,7 @@ define internal void @add({i32, i32}* %this, i32* sret %r) {
 ;
 ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@add
-; IS__CGSCC____-SAME: ({ i32, i32 }* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
+; IS__CGSCC____-SAME: ({ i32, i32 }* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[THIS:%.*]], i32* nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R:%.*]])
 ; IS__CGSCC____-NEXT: [[AP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 0
 ; IS__CGSCC____-NEXT: [[BP:%.*]] = getelementptr { i32, i32 }, { i32, i32 }* [[THIS]], i32 0, i32 1
 ; IS__CGSCC____-NEXT: [[A:%.*]] = load i32, i32* [[AP]], align 8
@@ -56,28 +56,28 @@ define void @f() {
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@f()
 ; IS__TUNIT_OPM-NEXT: [[R:%.*]] = alloca i32, align 4
 ; IS__TUNIT_OPM-NEXT: [[PAIR:%.*]] = alloca { i32, i32 }, align 8
-; IS__TUNIT_OPM-NEXT: call void @add({ i32, i32 }* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R]])
+; IS__TUNIT_OPM-NEXT: call void @add({ i32, i32 }* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R]])
 ; IS__TUNIT_OPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@f()
 ; IS__TUNIT_NPM-NEXT: [[R:%.*]] = alloca i32, align 4
 ; IS__TUNIT_NPM-NEXT: [[PAIR:%.*]] = alloca { i32, i32 }, align 8
-; IS__TUNIT_NPM-NEXT: call void @add({ i32, i32 }* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* noalias nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R]])
+; IS__TUNIT_NPM-NEXT: call void @add({ i32, i32 }* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* noalias nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R]])
 ; IS__TUNIT_NPM-NEXT: ret void
 ;
 ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@f()
 ; IS__CGSCC_OPM-NEXT: [[R:%.*]] = alloca i32, align 4
 ; IS__CGSCC_OPM-NEXT: [[PAIR:%.*]] = alloca { i32, i32 }, align 8
-; IS__CGSCC_OPM-NEXT: call void @add({ i32, i32 }* nocapture nofree nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R]])
+; IS__CGSCC_OPM-NEXT: call void @add({ i32, i32 }* nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R]])
 ; IS__CGSCC_OPM-NEXT: ret void
 ;
 ; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f()
 ; IS__CGSCC_NPM-NEXT: [[R:%.*]] = alloca i32, align 4
 ; IS__CGSCC_NPM-NEXT: [[PAIR:%.*]] = alloca { i32, i32 }, align 8
-; IS__CGSCC_NPM-NEXT: call void @add({ i32, i32 }* noalias nocapture nofree nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* noalias nocapture nofree nonnull sret writeonly align 4 dereferenceable(4) [[R]])
+; IS__CGSCC_NPM-NEXT: call void @add({ i32, i32 }* noalias nocapture nofree noundef nonnull readonly align 8 dereferenceable(8) [[PAIR]], i32* noalias nocapture nofree noundef nonnull sret writeonly align 4 dereferenceable(4) [[R]])
 ; IS__CGSCC_NPM-NEXT: ret void
 ;
 %r = alloca i32
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
index ce4f78065d1161..685e21df3d27c0 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/tail.ll
@@ -34,7 +34,7 @@ define internal void @bar(%pair* byval %Data) {
 ; IS__CGSCC_NPM-NEXT: store i32 [[TMP0]], i32* [[DATA_PRIV_CAST]], align 4
 ; IS__CGSCC_NPM-NEXT: [[DATA_PRIV_0_1:%.*]] = getelementptr [[PAIR]], %pair* [[DATA_PRIV]], i32 0, i32 1
 ; IS__CGSCC_NPM-NEXT: store i32 [[TMP1]], i32* [[DATA_PRIV_0_1]], align 4
-; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = call i8* @foo(%pair* nonnull align 8 dereferenceable(8) [[DATA_PRIV]])
+; IS__CGSCC_NPM-NEXT: [[TMP3:%.*]] = call i8* @foo(%pair* noundef nonnull align 8 dereferenceable(8) [[DATA_PRIV]])
 ; IS__CGSCC_NPM-NEXT: ret void
 ;
 tail call i8* @foo(%pair* %Data)
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
index 6120b725cc74fc..b072069f8945ef 100644
--- a/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
+++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/variadic.ll
@@ -21,7 +21,7 @@ define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
 ; CHECK-LABEL: define {{[^@]+}}@main
 ; CHECK-SAME: (i32 [[ARGC:%.*]], i8** nocapture nofree readnone [[ARGV:%.*]])
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call void (i8*, i8*, i8*, i8*, i8*, ...) @callee_t0f(i8* undef, i8* undef, i8* undef, i8* undef, i8* undef, %struct.tt0* nonnull byval align 8 dereferenceable(16) @t45)
+; CHECK-NEXT: tail call void (i8*, i8*, i8*, i8*, i8*, ...) @callee_t0f(i8* undef, i8* undef, i8* undef, i8* undef, i8* undef, %struct.tt0* noundef nonnull byval align 8 dereferenceable(16) @t45)
 ; CHECK-NEXT: ret i32 0
 ;
 entry:
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
index 98051fc678ad3c..a6e27f7254dd47 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/2009-09-24-byval-ptr.ll
@@ -11,7 +11,7 @@
 define internal void @vfu1(%struct.MYstr* byval align 4 %u) nounwind {
 ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@vfu1
-; IS__CGSCC_OPM-SAME: (%struct.MYstr* noalias nocapture nofree nonnull writeonly byval align 8 dereferenceable(8) [[U:%.*]])
+; IS__CGSCC_OPM-SAME: (%struct.MYstr* noalias nocapture nofree noundef nonnull writeonly byval align 8 dereferenceable(8) [[U:%.*]])
 ; IS__CGSCC_OPM-NEXT: entry:
 ; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], %struct.MYstr* [[U]], i32 0, i32 1
 ; IS__CGSCC_OPM-NEXT: store i32 99, i32* [[TMP0]], align 4
@@ -52,7 +52,7 @@ return: ; preds = %entry
 define internal i32 @vfu2(%struct.MYstr* byval align 4 %u) nounwind readonly {
 ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readonly willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@vfu2
-; IS__TUNIT_OPM-SAME: (%struct.MYstr* noalias nocapture nofree nonnull readonly byval align 8 dereferenceable(8) [[U:%.*]])
+; IS__TUNIT_OPM-SAME: (%struct.MYstr* noalias nocapture nofree noundef nonnull readonly byval align 8 dereferenceable(8) [[U:%.*]])
 ; IS__TUNIT_OPM-NEXT: entry:
 ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], %struct.MYstr* @mystr, i32 0, i32 1
 ; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
@@ -104,7 +104,7 @@ define i32 @unions() nounwind {
 ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@unions()
 ; IS__TUNIT_OPM-NEXT: entry:
-; IS__TUNIT_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2(%struct.MYstr* nocapture nofree nonnull readonly byval align 8 dereferenceable(8) @mystr)
+; IS__TUNIT_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2(%struct.MYstr* nocapture nofree noundef nonnull readonly byval align 8 dereferenceable(8) @mystr)
 ; IS__TUNIT_OPM-NEXT: ret i32 [[RESULT]]
 ;
 ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind willreturn
@@ -132,7 +132,7 @@ entry:
 define internal i32 @vfu2_v2(%struct.MYstr* byval align 4 %u) nounwind readonly {
 ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@vfu2_v2
-; IS__TUNIT_OPM-SAME: (%struct.MYstr* noalias nocapture nofree nonnull byval align 8 dereferenceable(8) [[U:%.*]])
+; IS__TUNIT_OPM-SAME: (%struct.MYstr* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(8) [[U:%.*]])
 ; IS__TUNIT_OPM-NEXT: entry:
 ; IS__TUNIT_OPM-NEXT: [[Z:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], %struct.MYstr* [[U]], i32 0, i32 1
 ; IS__TUNIT_OPM-NEXT: store i32 99, i32* [[Z]], align 4
@@ -165,7 +165,7 @@ define internal i32 @vfu2_v2(%struct.MYstr* byval align 4 %u) nounwind readonly
 ;
 ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@vfu2_v2
-; IS__CGSCC_OPM-SAME: (%struct.MYstr* noalias nocapture nofree nonnull byval align 8 dereferenceable(8) [[U:%.*]])
+; IS__CGSCC_OPM-SAME: (%struct.MYstr* noalias nocapture nofree noundef nonnull byval align 8 dereferenceable(8) [[U:%.*]])
 ; IS__CGSCC_OPM-NEXT: entry:
 ; IS__CGSCC_OPM-NEXT: [[Z:%.*]] = getelementptr [[STRUCT_MYSTR:%.*]], %struct.MYstr* [[U]], i32 0, i32 1
 ; IS__CGSCC_OPM-NEXT: store i32 99, i32* [[Z]], align 4
@@ -212,7 +212,7 @@ define i32 @unions_v2() nounwind {
 ; IS__TUNIT_OPM: Function Attrs: nofree nosync nounwind readnone willreturn
 ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@unions_v2()
 ; IS__TUNIT_OPM-NEXT: entry:
-; IS__TUNIT_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(%struct.MYstr* nocapture nofree nonnull readonly byval align 8 dereferenceable(8) @mystr)
+; IS__TUNIT_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(%struct.MYstr* nocapture nofree noundef nonnull readonly byval align 8 dereferenceable(8) @mystr)
 ; IS__TUNIT_OPM-NEXT: ret i32 [[RESULT]]
 ;
 ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn
@@ -228,7 +228,7 @@ define i32 @unions_v2() nounwind {
 ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@unions_v2()
 ; IS__CGSCC_OPM-NEXT: entry:
-; IS__CGSCC_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(%struct.MYstr* noalias nocapture nofree nonnull readnone byval align 8 dereferenceable(8) @mystr)
+; IS__CGSCC_OPM-NEXT: [[RESULT:%.*]] = call i32 @vfu2_v2(%struct.MYstr* noalias nocapture nofree noundef nonnull readnone byval align 8 dereferenceable(8) @mystr)
 ; IS__CGSCC_OPM-NEXT: ret i32 [[RESULT]]
 ;
 ; IS__CGSCC_NPM: Function Attrs: nofree norecurse nosync nounwind readonly willreturn
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
index e6b8e8317a42ae..da44880a7c6fea 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/multiple_callbacks.ll
@@ -128,10 +128,10 @@ entry:
 define void @foo() {
 ; CHECK-LABEL: define {{[^@]+}}@foo()
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @broker(i32 (i32)* nonnull @cb0, i32 (i32)* nonnull @cb1, i32 (i32)* nonnull @cb0, i32 0, i32 1)
-; CHECK-NEXT: call void @broker(i32 (i32)* nonnull @cb1, i32 (i32)* nonnull @cb2, i32 (i32)* nonnull @cb2, i32 0, i32 1)
-; CHECK-NEXT: call void @broker(i32 (i32)* nonnull @cb3, i32 (i32)* nonnull @cb2, i32 (i32)* nonnull @cb3, i32 0, i32 1)
-; CHECK-NEXT: call void @broker(i32 (i32)* nonnull @cb4, i32 (i32)* nonnull @cb4, i32 (i32)* nonnull @cb4, i32 0, i32 1)
+; CHECK-NEXT: call void @broker(i32 (i32)* noundef nonnull @cb0, i32 (i32)* noundef nonnull @cb1, i32 (i32)* noundef nonnull @cb0, i32 0, i32 1)
+; CHECK-NEXT: call void @broker(i32 (i32)* noundef nonnull @cb1, i32 (i32)* noundef nonnull @cb2, i32 (i32)* noundef nonnull @cb2, i32 0, i32 1)
+; CHECK-NEXT: call void @broker(i32 (i32)* noundef nonnull @cb3, i32 (i32)* noundef nonnull @cb2, i32 (i32)* noundef nonnull @cb3, i32 0, i32 1)
+; CHECK-NEXT: call void @broker(i32 (i32)* noundef nonnull @cb4, i32 (i32)* noundef nonnull @cb4, i32 (i32)* noundef nonnull @cb4, i32 0, i32 1)
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
index 137193b972ca6c..e33db1ca6f4a06 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/openmp_parallel_for.ll
@@ -36,7 +36,7 @@ define dso_local void @foo(i32 %N) {
 ; IS__TUNIT_OPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
 ; IS__TUNIT_OPM-NEXT: store float 3.000000e+00, float* [[P]], align 4
 ; IS__TUNIT_OPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4
-; IS__TUNIT_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef)
+; IS__TUNIT_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef)
 ; IS__TUNIT_OPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@foo
@@ -47,7 +47,7 @@ define dso_local void @foo(i32 %N) {
 ; IS__TUNIT_NPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
 ; IS__TUNIT_NPM-NEXT: store float 3.000000e+00, float* [[P]], align 4
 ; IS__TUNIT_NPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4
-; IS__TUNIT_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef)
+; IS__TUNIT_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P]], i64 undef)
 ; IS__TUNIT_NPM-NEXT: ret void
 ;
 ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@foo
@@ -58,7 +58,7 @@ define dso_local void @foo(i32 %N) {
 ; IS__CGSCC_OPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
 ; IS__CGSCC_OPM-NEXT: store float 3.000000e+00, float* [[P]], align 4
 ; IS__CGSCC_OPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4
-; IS__CGSCC_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 4617315517961601024)
+; IS__CGSCC_OPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P]], i64 4617315517961601024)
 ; IS__CGSCC_OPM-NEXT: ret void
 ;
 ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@foo
@@ -69,7 +69,7 @@ define dso_local void @foo(i32 %N) {
 ; IS__CGSCC_NPM-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4
 ; IS__CGSCC_NPM-NEXT: store float 3.000000e+00, float* [[P]], align 4
 ; IS__CGSCC_NPM-NEXT: store i32 7, i32* [[N_ADDR]], align 4
-; IS__CGSCC_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[P]], i64 4617315517961601024)
+; IS__CGSCC_NPM-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB1:@.*]], i32 3, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, float*, i64)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N_ADDR]], float* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P]], i64 4617315517961601024)
 ; IS__CGSCC_NPM-NEXT: ret void
 ;
 entry:
@@ -84,7 +84,7 @@ entry:

 define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %N, float* dereferenceable(4) %p, i64 %q) {
 ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@.omp_outlined.
-; NOT_TUNIT_NPM-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull readonly align 4 dereferenceable(4) [[N:%.*]], float* nocapture nonnull readonly align 4 dereferenceable(4) [[P:%.*]], i64 [[Q:%.*]])
+; NOT_TUNIT_NPM-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N:%.*]], float* nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P:%.*]], i64 [[Q:%.*]])
 ; NOT_TUNIT_NPM-NEXT: entry:
 ; NOT_TUNIT_NPM-NEXT: [[Q_ADDR:%.*]] = alloca i64, align 8
 ; NOT_TUNIT_NPM-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
@@ -103,7 +103,7 @@ define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.
 ; NOT_TUNIT_NPM-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 ; NOT_TUNIT_NPM-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 ; NOT_TUNIT_NPM-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NOT_TUNIT_NPM-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB0:@.*]], i32 [[TMP5]], i32 34, i32* nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 1, i32 1)
+; NOT_TUNIT_NPM-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB0:@.*]], i32 [[TMP5]], i32 34, i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 1, i32 1)
 ; NOT_TUNIT_NPM-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 ; NOT_TUNIT_NPM-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP6]], [[SUB3]]
 ; NOT_TUNIT_NPM-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
@@ -139,13 +139,13 @@ define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.
 ; NOT_TUNIT_NPM-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
 ; NOT_TUNIT_NPM: omp.loop.exit:
 ; NOT_TUNIT_NPM-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; NOT_TUNIT_NPM-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB0]], i32 [[TMP12]])
+; NOT_TUNIT_NPM-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB0]], i32 [[TMP12]])
 ; NOT_TUNIT_NPM-NEXT: br label [[OMP_PRECOND_END]]
 ; NOT_TUNIT_NPM: omp.precond.end:
 ; NOT_TUNIT_NPM-NEXT: ret void
 ;
 ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@.omp_outlined.
-; IS__TUNIT_NPM-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[N:%.*]], float* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[P:%.*]], i64 [[Q:%.*]])
+; IS__TUNIT_NPM-SAME: (i32* noalias nocapture readonly [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[N:%.*]], float* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[P:%.*]], i64 [[Q:%.*]])
 ; IS__TUNIT_NPM-NEXT: entry:
 ; IS__TUNIT_NPM-NEXT: [[Q_ADDR:%.*]] = alloca i64, align 8
 ; IS__TUNIT_NPM-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
@@ -164,7 +164,7 @@ define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.
 ; IS__TUNIT_NPM-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
 ; IS__TUNIT_NPM-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
 ; IS__TUNIT_NPM-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; IS__TUNIT_NPM-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB0:@.*]], i32 [[TMP5]], i32 34, i32* nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], i32* nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 1, i32 1)
+; IS__TUNIT_NPM-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB0:@.*]], i32 [[TMP5]], i32 34, i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_IS_LAST]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_LB]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_UB]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTOMP_STRIDE]], i32 1, i32 1)
 ; IS__TUNIT_NPM-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
 ; IS__TUNIT_NPM-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[TMP6]], [[SUB3]]
 ; IS__TUNIT_NPM-NEXT: br i1 [[CMP6]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
@@ -200,7 +200,7 @@ define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.
 ; IS__TUNIT_NPM-NEXT: br label [[OMP_LOOP_EXIT:%.*]]
 ; IS__TUNIT_NPM: omp.loop.exit:
 ; IS__TUNIT_NPM-NEXT: [[TMP12:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
-; IS__TUNIT_NPM-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* nonnull align 8 dereferenceable(24) [[GLOB0]], i32 [[TMP12]])
+; IS__TUNIT_NPM-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) [[GLOB0]], i32 [[TMP12]])
 ; IS__TUNIT_NPM-NEXT: br label [[OMP_PRECOND_END]]
 ; IS__TUNIT_NPM: omp.precond.end:
 ; IS__TUNIT_NPM-NEXT: ret void
diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
index 7ac5b42d741723..bf3ee0ff8eec5b 100644
--- a/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
+++ b/llvm/test/Transforms/Attributor/IPConstantProp/pthreads.ll
@@ -36,10 +36,10 @@ define dso_local i32 @main() {
 ; IS__TUNIT____-NEXT: [[ALLOC1:%.*]] = alloca i8, align 8
 ; IS__TUNIT____-NEXT: [[ALLOC2:%.*]] = alloca i8, align 8
 ; IS__TUNIT____-NEXT: [[THREAD:%.*]] = alloca i64, align 8
-; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @foo, i8* noalias nocapture nofree readnone align 536870912 undef)
-; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @bar, i8* noalias nofree nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" undef)
-; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @baz, i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(1) [[ALLOC1]])
-; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @buz, i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[ALLOC2]])
+; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @foo, i8* noalias nocapture nofree noundef readnone align 536870912 undef)
+; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @bar, i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(8) "no-capture-maybe-returned" undef)
+; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @baz, i8* noalias nocapture nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC1]])
+; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @buz, i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[ALLOC2]])
 ; IS__TUNIT____-NEXT: ret i32 0
 ;
 ; IS__CGSCC____-LABEL: define {{[^@]+}}@main()
@@ -47,10 +47,10 @@ define dso_local i32 @main() {
 ; IS__CGSCC____-NEXT: [[ALLOC1:%.*]] = alloca i8, align 8
 ; IS__CGSCC____-NEXT: [[ALLOC2:%.*]] = alloca i8, align 8
 ; IS__CGSCC____-NEXT: [[THREAD:%.*]] = alloca i64, align 8
-; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @foo, i8* noalias nocapture nofree readnone align 536870912 null)
-; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @bar, i8* noalias nofree nonnull readnone align 8 dereferenceable(8) bitcast (i8** @GlobalVPtr to i8*))
-; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @baz, i8* noalias nocapture nofree nonnull readnone align 8 dereferenceable(1) [[ALLOC1]])
-; IS__CGSCC____-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(i64* nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture align 536870912 null, i8* (i8*)* nonnull @buz, i8* noalias nofree nonnull readnone align 8 dereferenceable(1) [[ALLOC2]])
+; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @foo, i8* noalias nocapture nofree noundef readnone align 536870912 null)
+; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @bar, i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(8) bitcast (i8** @GlobalVPtr to i8*))
+; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8)
[[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @baz, i8* noalias nocapture nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC1]]) +; IS__CGSCC____-NEXT: [[CALL3:%.*]] = call i32 @pthread_create(i64* noundef nonnull align 8 dereferenceable(8) [[THREAD]], %union.pthread_attr_t* noalias nocapture noundef align 536870912 null, i8* (i8*)* noundef nonnull @buz, i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) [[ALLOC2]]) ; IS__CGSCC____-NEXT: ret i32 0 ; entry: @@ -69,13 +69,13 @@ declare !callback !0 dso_local i32 @pthread_create(i64*, %union.pthread_attr_t*, define internal i8* @foo(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@foo -; IS__TUNIT____-SAME: (i8* noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* null ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@foo -; IS__CGSCC____-SAME: (i8* noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* null ; @@ -86,13 +86,13 @@ entry: define internal i8* @bar(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@bar -; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone returned align 8 dereferenceable(8) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone returned align 8 dereferenceable(8) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* bitcast (i8** @GlobalVPtr to i8*) ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@bar -; IS__CGSCC____-SAME: (i8* nofree readnone returned "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* nofree noundef readnone returned "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* bitcast (i8** @GlobalVPtr to i8*) ; @@ -103,13 +103,13 @@ entry: define internal i8* @baz(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@baz -; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* [[ARG]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@baz -; IS__CGSCC____-SAME: (i8* nofree nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* nofree noundef nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* [[ARG]] ; 
@@ -120,13 +120,13 @@ entry: define internal i8* @buz(i8* %arg) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@buz -; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: ret i8* [[ARG]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@buz -; IS__CGSCC____-SAME: (i8* nofree nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) +; IS__CGSCC____-SAME: (i8* nofree noundef nonnull readnone returned align 8 dereferenceable(1) "no-capture-maybe-returned" [[ARG:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: ret i8* [[ARG]] ; diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll index c6af31713bfac9..0d472837ddb845 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-argument.ll @@ -8,7 +8,7 @@ define internal i32* @incdec(i1 %C, i32* %V) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@incdec -; IS__TUNIT____-SAME: (i1 [[C:%.*]], i32* noalias nofree nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) +; IS__TUNIT____-SAME: (i1 [[C:%.*]], i32* noalias nofree noundef nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) ; IS__TUNIT____-NEXT: [[X:%.*]] = load i32, i32* [[V]], align 4 ; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__TUNIT____: T: @@ -22,7 +22,7 @@ define internal i32* @incdec(i1 %C, i32* %V) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@incdec -; IS__CGSCC____-SAME: (i1 [[C:%.*]], i32* nofree nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) +; IS__CGSCC____-SAME: (i1 [[C:%.*]], i32* nofree noundef nonnull returned align 4 dereferenceable(4) "no-capture-maybe-returned" [[V:%.*]]) ; IS__CGSCC____-NEXT: [[X:%.*]] = load i32, i32* [[V]], align 4 ; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__CGSCC____: T: @@ -78,7 +78,7 @@ define void @caller(i1 %C) personality i32 (...)* @__gxx_personality_v0 { ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller ; IS__TUNIT____-SAME: (i1 [[C:%.*]]) [[ATTR2:#.*]] personality i32 (...)* @__gxx_personality_v0 ; IS__TUNIT____-NEXT: [[Q:%.*]] = alloca i32, align 4 -; IS__TUNIT____-NEXT: [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) +; IS__TUNIT____-NEXT: [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) "no-capture-maybe-returned" [[Q]]) ; IS__TUNIT____-NEXT: [[S1:%.*]] = call { i32, i32 } @foo(i32 1, i32 2) ; IS__TUNIT____-NEXT: [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0 ; IS__TUNIT____-NEXT: [[S2:%.*]] = call { i32, i32 } @foo(i32 3, i32 4) @@ -97,7 +97,7 @@ define void @caller(i1 %C) personality i32 (...)* @__gxx_personality_v0 { ; IS__CGSCC____-LABEL: define 
{{[^@]+}}@caller ; IS__CGSCC____-SAME: (i1 [[C:%.*]]) [[ATTR1:#.*]] personality i32 (...)* @__gxx_personality_v0 ; IS__CGSCC____-NEXT: [[Q:%.*]] = alloca i32, align 4 -; IS__CGSCC____-NEXT: [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree nonnull align 4 dereferenceable(4) [[Q]]) +; IS__CGSCC____-NEXT: [[W:%.*]] = call align 4 i32* @incdec(i1 [[C]], i32* noalias nofree noundef nonnull align 4 dereferenceable(4) [[Q]]) ; IS__CGSCC____-NEXT: [[S1:%.*]] = call { i32, i32 } @foo(i32 1, i32 2) ; IS__CGSCC____-NEXT: [[X1:%.*]] = extractvalue { i32, i32 } [[S1]], 0 ; IS__CGSCC____-NEXT: [[S2:%.*]] = call { i32, i32 } @foo(i32 3, i32 4) diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll index 904860e4921c7b..4405b7bc1b0955 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/thread_local_acs.ll @@ -26,7 +26,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define internal i32 @callee(i32* %thread_local_ptr, i32* %shared_ptr) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readonly willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@callee -; IS__TUNIT____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) +; IS__TUNIT____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[TMP:%.*]] = load i32, i32* [[THREAD_LOCAL_PTR]], align 4 ; IS__TUNIT____-NEXT: [[TMP1:%.*]] = load i32, i32* @gsh, align 4 @@ -35,7 +35,7 @@ define internal i32 @callee(i32* %thread_local_ptr, i32* %shared_ptr) { ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readonly willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@callee -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[THREAD_LOCAL_PTR:%.*]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[SHARED_PTR:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[TMP:%.*]] = load i32, i32* [[THREAD_LOCAL_PTR]], align 4 ; IS__CGSCC____-NEXT: [[TMP1:%.*]] = load i32, i32* @gsh, align 4 @@ -52,12 +52,12 @@ entry: define dso_local void @caller() { ; IS__TUNIT____-LABEL: define {{[^@]+}}@caller() ; IS__TUNIT____-NEXT: entry: -; IS__TUNIT____-NEXT: call void @broker(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* nonnull @callee, i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) undef) +; IS__TUNIT____-NEXT: call void @broker(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* noundef nonnull @callee, i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) undef) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____-LABEL: define {{[^@]+}}@caller() ; IS__CGSCC____-NEXT: entry: -; IS__CGSCC____-NEXT: call void @broker(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* nonnull @callee, i32* 
nocapture nofree nonnull readonly align 4 dereferenceable(4) @gsh) +; IS__CGSCC____-NEXT: call void @broker(i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @gtl, i32 (i32*, i32*)* noundef nonnull @callee, i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) @gsh) ; IS__CGSCC____-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll index b31327b6a85231..7df160e817b5f9 100644 --- a/llvm/test/Transforms/Attributor/align.ll +++ b/llvm/test/Transforms/Attributor/align.ll @@ -149,7 +149,7 @@ define i32* @test6_2() #0 { define internal i8* @f1(i8* readnone %0) local_unnamed_addr #0 { ; IS__TUNIT____: Function Attrs: nofree noinline nosync nounwind readnone uwtable ; IS__TUNIT____-LABEL: define {{[^@]+}}@f1 -; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr +; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr ; IS__TUNIT____-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null ; IS__TUNIT____-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; IS__TUNIT____: 3: @@ -161,7 +161,7 @@ define internal i8* @f1(i8* readnone %0) local_unnamed_addr #0 { ; ; IS__CGSCC____: Function Attrs: nofree noinline nosync nounwind readnone uwtable ; IS__CGSCC____-LABEL: define {{[^@]+}}@f1 -; IS__CGSCC____-SAME: (i8* nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr +; IS__CGSCC____-SAME: (i8* nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr ; IS__CGSCC____-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null ; IS__CGSCC____-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; IS__CGSCC____: 3: @@ -191,7 +191,7 @@ define internal i8* @f2(i8* readnone %0) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* @a1, null ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP4:%.*]], label [[TMP2:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1(i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) ; CHECK-NEXT: br label [[TMP6:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = tail call i8* @f3() @@ -224,7 +224,7 @@ define internal i8* @f3(i8* readnone %0) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* @a2, null ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP4:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1(i8* noalias nofree nonnull readnone align 16 dereferenceable(1) "no-capture-maybe-returned" @a2) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1(i8* noalias nofree noundef nonnull readnone align 16 dereferenceable(1) "no-capture-maybe-returned" @a2) ; CHECK-NEXT: br label [[TMP4]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = phi i8* [ [[TMP3]], [[TMP2]] ], [ @a1, [[TMP0:%.*]] ] @@ -247,12 +247,12 @@ define internal i8* @f3(i8* readnone %0) local_unnamed_addr #0 { define align 4 i8* @test7() #0 { ; IS__TUNIT____: Function Attrs: nofree noinline nosync nounwind readnone uwtable ; IS__TUNIT____-LABEL: define {{[^@]+}}@test7() -; IS__TUNIT____-NEXT: [[C:%.*]] = tail call i8* @f1(i8* noalias nofree nonnull readnone align 8 
dereferenceable(1) "no-capture-maybe-returned" @a1) +; IS__TUNIT____-NEXT: [[C:%.*]] = tail call i8* @f1(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) ; IS__TUNIT____-NEXT: ret i8* [[C]] ; ; IS__CGSCC____: Function Attrs: nofree noinline nosync nounwind readnone uwtable ; IS__CGSCC____-LABEL: define {{[^@]+}}@test7() -; IS__CGSCC____-NEXT: [[C:%.*]] = tail call nonnull align 8 dereferenceable(1) i8* @f1(i8* noalias nofree nonnull readnone align 8 dereferenceable(1) @a1) +; IS__CGSCC____-NEXT: [[C:%.*]] = tail call nonnull align 8 dereferenceable(1) i8* @f1(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) @a1) ; IS__CGSCC____-NEXT: ret i8* [[C]] ; %c = tail call i8* @f1(i8* align 8 dereferenceable(1) @a1) @@ -264,7 +264,7 @@ define align 4 i8* @test7() #0 { define internal i8* @f1b(i8* readnone %0) local_unnamed_addr #0 { ; IS__TUNIT____: Function Attrs: nofree noinline nosync nounwind uwtable ; IS__TUNIT____-LABEL: define {{[^@]+}}@f1b -; IS__TUNIT____-SAME: (i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr +; IS__TUNIT____-SAME: (i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr ; IS__TUNIT____-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null ; IS__TUNIT____-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; IS__TUNIT____: 3: @@ -278,7 +278,7 @@ define internal i8* @f1b(i8* readnone %0) local_unnamed_addr #0 { ; ; IS__CGSCC____: Function Attrs: nofree noinline nosync nounwind uwtable ; IS__CGSCC____-LABEL: define {{[^@]+}}@f1b -; IS__CGSCC____-SAME: (i8* nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr +; IS__CGSCC____-SAME: (i8* nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" [[TMP0:%.*]]) local_unnamed_addr ; IS__CGSCC____-NEXT: [[TMP2:%.*]] = icmp eq i8* [[TMP0]], null ; IS__CGSCC____-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; IS__CGSCC____: 3: @@ -312,7 +312,7 @@ define internal i8* @f2b(i8* readnone %0) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* @a1, null ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP4:%.*]], label [[TMP2:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1b(i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1b(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) ; CHECK-NEXT: br label [[TMP6:%.*]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = tail call i8* @f3b() @@ -346,7 +346,7 @@ define internal i8* @f3b(i8* readnone %0) local_unnamed_addr #0 { ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8* @a2, null ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP4:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1b(i8* noalias nofree nonnull readnone align 16 dereferenceable(1) "no-capture-maybe-returned" @a2) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i8* @f1b(i8* noalias nofree noundef nonnull readnone align 16 dereferenceable(1) "no-capture-maybe-returned" @a2) ; CHECK-NEXT: br label [[TMP4]] ; CHECK: 4: ; CHECK-NEXT: [[TMP5:%.*]] = phi i8* [ [[TMP3]], [[TMP2]] ], [ @a1, [[TMP0:%.*]] ] @@ -368,13 +368,13 @@ define align 4 i32* @test7b(i32* align 32 %p) #0 { ; IS__TUNIT____: Function Attrs: nofree noinline nosync nounwind uwtable 
; IS__TUNIT____-LABEL: define {{[^@]+}}@test7b ; IS__TUNIT____-SAME: (i32* nofree readnone returned align 32 "no-capture-maybe-returned" [[P:%.*]]) -; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call i8* @f1b(i8* noalias nofree nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call i8* @f1b(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) "no-capture-maybe-returned" @a1) ; IS__TUNIT____-NEXT: ret i32* [[P]] ; ; IS__CGSCC____: Function Attrs: nofree noinline nosync nounwind uwtable ; IS__CGSCC____-LABEL: define {{[^@]+}}@test7b ; IS__CGSCC____-SAME: (i32* nofree readnone returned align 32 "no-capture-maybe-returned" [[P:%.*]]) -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call i8* @f1b(i8* noalias nofree nonnull readnone align 8 dereferenceable(1) @a1) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call i8* @f1b(i8* noalias nofree noundef nonnull readnone align 8 dereferenceable(1) @a1) ; IS__CGSCC____-NEXT: ret i32* [[P]] ; tail call i8* @f1b(i8* align 8 dereferenceable(1) @a1) @@ -384,12 +384,12 @@ define align 4 i32* @test7b(i32* align 32 %p) #0 { ; TEST 8 define void @test8_helper() { ; CHECK-LABEL: define {{[^@]+}}@test8_helper() -; CHECK-NEXT: [[PTR0:%.*]] = tail call i32* @unknown() -; CHECK-NEXT: [[PTR1:%.*]] = tail call align 4 i32* @unknown() -; CHECK-NEXT: [[PTR2:%.*]] = tail call align 8 i32* @unknown() -; CHECK-NEXT: tail call void @test8(i32* noalias nocapture readnone align 4 [[PTR1]], i32* noalias nocapture readnone align 4 [[PTR1]], i32* noalias nocapture readnone [[PTR0]]) -; CHECK-NEXT: tail call void @test8(i32* noalias nocapture readnone align 8 [[PTR2]], i32* noalias nocapture readnone align 4 [[PTR1]], i32* noalias nocapture readnone align 4 [[PTR1]]) -; CHECK-NEXT: tail call void @test8(i32* noalias nocapture readnone align 8 [[PTR2]], i32* noalias nocapture readnone align 4 [[PTR1]], i32* noalias nocapture readnone align 4 [[PTR1]]) +; CHECK-NEXT: [[PTR0:%.*]] = tail call noundef i32* @unknown() +; CHECK-NEXT: [[PTR1:%.*]] = tail call noundef align 4 i32* @unknown() +; CHECK-NEXT: [[PTR2:%.*]] = tail call noundef align 8 i32* @unknown() +; CHECK-NEXT: tail call void @test8(i32* noalias nocapture noundef readnone align 4 [[PTR1]], i32* noalias nocapture noundef readnone align 4 [[PTR1]], i32* noalias nocapture noundef readnone [[PTR0]]) +; CHECK-NEXT: tail call void @test8(i32* noalias nocapture noundef readnone align 8 [[PTR2]], i32* noalias nocapture noundef readnone align 4 [[PTR1]], i32* noalias nocapture noundef readnone align 4 [[PTR1]]) +; CHECK-NEXT: tail call void @test8(i32* noalias nocapture noundef readnone align 8 [[PTR2]], i32* noalias nocapture noundef readnone align 4 [[PTR1]], i32* noalias nocapture noundef readnone align 4 [[PTR1]]) ; CHECK-NEXT: ret void ; %ptr0 = tail call i32* @unknown() @@ -406,10 +406,10 @@ declare void @user_i32_ptr(i32* nocapture readnone) nounwind define internal void @test8(i32* %a, i32* %b, i32* %c) { ; IS__TUNIT____: Function Attrs: nounwind ; IS__TUNIT____-LABEL: define {{[^@]+}}@test8 -; IS__TUNIT____-SAME: (i32* noalias nocapture readnone align 4 [[A:%.*]], i32* noalias nocapture readnone align 4 [[B:%.*]], i32* noalias nocapture readnone [[C:%.*]]) -; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture readnone align 4 [[A]]) -; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture readnone align 4 [[B]]) -; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture readnone [[C]]) +; 
IS__TUNIT____-SAME: (i32* noalias nocapture noundef readnone align 4 [[A:%.*]], i32* noalias nocapture noundef readnone align 4 [[B:%.*]], i32* noalias nocapture noundef readnone [[C:%.*]]) +; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture noundef readnone align 4 [[A]]) +; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture noundef readnone align 4 [[B]]) +; IS__TUNIT____-NEXT: call void @user_i32_ptr(i32* noalias nocapture noundef readnone [[C]]) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: nounwind diff --git a/llvm/test/Transforms/Attributor/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll index f1dfacea8a96ef..7abb5fd241ce73 100644 --- a/llvm/test/Transforms/Attributor/callbacks.ll +++ b/llvm/test/Transforms/Attributor/callbacks.ll @@ -25,7 +25,7 @@ define void @t0_caller(i32* %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t0_caller @@ -37,7 +37,7 @@ define void @t0_caller(i32* %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t0_caller @@ -49,7 +49,7 @@ define void @t0_caller(i32* %a) { ; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 99, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 99, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_OPM-NEXT: ret void ; ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@t0_caller @@ -61,7 +61,7 @@ define void @t0_caller(i32* %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture align 536870912 null, i32* nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 99, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t0_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t0_callback_callee to void (i32*, i32*, ...)*), i32* align 256 [[A]], i64 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -80,7 +80,7 @@ entry: define internal void @t0_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, i64 %b, i32** %c) { ; ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@t0_callback_callee -; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; NOT_TUNIT_NPM-NEXT: entry: ; NOT_TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; NOT_TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -89,7 +89,7 @@ define internal void @t0_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; NOT_TUNIT_NPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t0_callback_callee -; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture 
nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; IS__TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -124,7 +124,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t1_caller @@ -136,7 +136,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t1_caller @@ -148,7 +148,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_OPM-NEXT: ret void ; ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@t1_caller @@ -160,7 +160,7 @@ define void @t1_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t1_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t1_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -180,7 +180,7 @@ define internal void @t1_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; ; NOT_TUNIT_NPM: Function Attrs: nosync ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@t1_callback_callee -; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; NOT_TUNIT_NPM-NEXT: entry: ; NOT_TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; NOT_TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -190,7 +190,7 @@ define internal void @t1_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; ; IS__TUNIT_NPM: Function Attrs: nosync ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t1_callback_callee -; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* noalias nocapture align 256 
[[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* noalias nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; IS__TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -224,7 +224,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t2_caller @@ -236,7 +236,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t2_caller @@ -248,7 +248,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_OPM-NEXT: ret void ; ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@t2_caller @@ -260,7 +260,7 @@ define void @t2_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t2_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t2_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -281,7 +281,7 @@ entry: define internal void @t2_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, i64 %b, i32** %c) { ; ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@t2_callback_callee -; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; NOT_TUNIT_NPM-NEXT: entry: ; NOT_TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; NOT_TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -290,7 +290,7 @@ define internal void @t2_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; NOT_TUNIT_NPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t2_callback_callee -; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], 
i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; IS__TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -324,8 +324,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__TUNIT_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 undef, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_OPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t3_caller @@ -337,8 +337,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__TUNIT_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__TUNIT_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__TUNIT_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__TUNIT_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 undef, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__TUNIT_NPM-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@t3_caller @@ -350,8 +350,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__CGSCC_OPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_OPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_OPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_OPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* nocapture align 256 [[A]], i64 99, i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_OPM-NEXT: ret void ; ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@t3_caller @@ -363,8 +363,8 @@ define void @t3_caller(i32* noalias %a) { ; IS__CGSCC_NPM-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to i8* ; IS__CGSCC_NPM-NEXT: store i32 42, i32* [[B]], align 32 ; IS__CGSCC_NPM-NEXT: store i32* [[B]], i32** [[C]], align 64 -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) -; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture align 536870912 null, i32* noalias nocapture nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) @t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) +; IS__CGSCC_NPM-NEXT: call void (i32*, i32*, void (i32*, i32*, ...)*, ...) 
@t3_callback_broker(i32* noalias nocapture noundef align 536870912 null, i32* noalias nocapture noundef nonnull align 128 dereferenceable(4) [[PTR]], void (i32*, i32*, ...)* nocapture noundef bitcast (void (i32*, i32*, i32*, i64, i32**)* @t3_callback_callee to void (i32*, i32*, ...)*), i32* noalias nocapture align 256 [[A]], i64 99, i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C]]) ; IS__CGSCC_NPM-NEXT: ret void ; entry: @@ -386,7 +386,7 @@ entry: define internal void @t3_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, i64 %b, i32** %c) { ; ; NOT_TUNIT_NPM-LABEL: define {{[^@]+}}@t3_callback_callee -; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; NOT_TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; NOT_TUNIT_NPM-NEXT: entry: ; NOT_TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; NOT_TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 @@ -395,7 +395,7 @@ define internal void @t3_callback_callee(i32* %is_not_null, i32* %ptr, i32* %a, ; NOT_TUNIT_NPM-NEXT: ret void ; ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@t3_callback_callee -; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) +; IS__TUNIT_NPM-SAME: (i32* nocapture nonnull writeonly align 4 dereferenceable(4) [[IS_NOT_NULL:%.*]], i32* nocapture nonnull readonly align 8 dereferenceable(4) [[PTR:%.*]], i32* nocapture align 256 [[A:%.*]], i64 [[B:%.*]], i32** noalias nocapture noundef nonnull readonly align 64 dereferenceable(8) [[C:%.*]]) ; IS__TUNIT_NPM-NEXT: entry: ; IS__TUNIT_NPM-NEXT: [[PTR_VAL:%.*]] = load i32, i32* [[PTR]], align 8 ; IS__TUNIT_NPM-NEXT: store i32 [[PTR_VAL]], i32* [[IS_NOT_NULL]], align 4 diff --git a/llvm/test/Transforms/Attributor/depgraph.ll b/llvm/test/Transforms/Attributor/depgraph.ll index 059587789035e6..f7de3287b88359 100644 --- a/llvm/test/Transforms/Attributor/depgraph.ll +++ b/llvm/test/Transforms/Attributor/depgraph.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=attributor-cgscc -disable-output -attributor-print-dep < %s 2>&1 | FileCheck %s --check-prefixes=GRAPH ; RUN: opt -passes=attributor-cgscc -disable-output -attributor-dump-dep-graph -attributor-depgraph-dot-filename-prefix=%t < %s 2>/dev/null ; RUN: FileCheck %s -input-file=%t_0.dot --check-prefix=DOT diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll index 3f8fb81a2636b6..816d3df44560d9 100644 --- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -275,21 +275,37 @@ if.false: } define void @f7_2(i1 %c) { -; CHECK: Function Attrs: nounwind willreturn -; CHECK-LABEL: define {{[^@]+}}@f7_2 -; CHECK-SAME: (i1 [[C:%.*]]) -; 
CHECK-NEXT: [[PTR:%.*]] = tail call nonnull align 4 dereferenceable(4) i32* @unkown_ptr() -; CHECK-NEXT: [[A:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(4) [[PTR]]) -; CHECK-NEXT: [[ARG_A_0:%.*]] = load i32, i32* [[PTR]], align 4 -; CHECK-NEXT: [[B:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(4) [[PTR]]) -; CHECK-NEXT: br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] -; CHECK: if.true: -; CHECK-NEXT: [[C:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(8) [[PTR]]) -; CHECK-NEXT: [[D:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(8) [[PTR]]) -; CHECK-NEXT: [[E:%.*]] = tail call i32 @unkown_f(i32* nonnull align 4 dereferenceable(8) [[PTR]]) -; CHECK-NEXT: ret void -; CHECK: if.false: -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM: Function Attrs: nounwind willreturn +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@f7_2 +; NOT_CGSCC_NPM-SAME: (i1 [[C:%.*]]) +; NOT_CGSCC_NPM-NEXT: [[PTR:%.*]] = tail call noundef nonnull align 4 dereferenceable(4) i32* @unkown_ptr() +; NOT_CGSCC_NPM-NEXT: [[A:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(4) [[PTR]]) +; NOT_CGSCC_NPM-NEXT: [[ARG_A_0:%.*]] = load i32, i32* [[PTR]], align 4 +; NOT_CGSCC_NPM-NEXT: [[B:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(4) [[PTR]]) +; NOT_CGSCC_NPM-NEXT: br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; NOT_CGSCC_NPM: if.true: +; NOT_CGSCC_NPM-NEXT: [[C:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; NOT_CGSCC_NPM-NEXT: [[D:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; NOT_CGSCC_NPM-NEXT: [[E:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; NOT_CGSCC_NPM-NEXT: ret void +; NOT_CGSCC_NPM: if.false: +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC_NPM: Function Attrs: nounwind willreturn +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@f7_2 +; IS__CGSCC_NPM-SAME: (i1 [[C:%.*]]) +; IS__CGSCC_NPM-NEXT: [[PTR:%.*]] = tail call nonnull align 4 dereferenceable(4) i32* @unkown_ptr() +; IS__CGSCC_NPM-NEXT: [[A:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC_NPM-NEXT: [[ARG_A_0:%.*]] = load i32, i32* [[PTR]], align 4 +; IS__CGSCC_NPM-NEXT: [[B:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC_NPM-NEXT: br i1 [[C]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; IS__CGSCC_NPM: if.true: +; IS__CGSCC_NPM-NEXT: [[C:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; IS__CGSCC_NPM-NEXT: [[D:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; IS__CGSCC_NPM-NEXT: [[E:%.*]] = tail call i32 @unkown_f(i32* noundef nonnull align 4 dereferenceable(8) [[PTR]]) +; IS__CGSCC_NPM-NEXT: ret void +; IS__CGSCC_NPM: if.false: +; IS__CGSCC_NPM-NEXT: ret void ; %ptr = tail call i32* @unkown_ptr() %A = tail call i32 @unkown_f(i32* %ptr) @@ -1017,23 +1033,41 @@ define void @nonnull_assume_call(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { ; ATTRIBUTOR-NEXT: call void @unknown() ; ATTRIBUTOR-NEXT: ret void ; -; CHECK-LABEL: define {{[^@]+}}@nonnull_assume_call -; CHECK-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) -; CHECK-NEXT: call void @unknown() -; CHECK-NEXT: [[P:%.*]] = call nonnull dereferenceable(101) i32* @unkown_ptr() -; CHECK-NEXT: call void 
@unknown_use32(i32* nonnull dereferenceable(101) [[P]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) -; CHECK-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG3]]), "dereferenceable"(i8* [[ARG1]], i64 1), "dereferenceable"(i8* [[ARG1]], i64 2), "dereferenceable"(i32* [[P]], i64 101), "dereferenceable_or_null"(i8* [[ARG2]], i64 31), "dereferenceable_or_null"(i8* [[ARG4]], i64 42) ] -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) -; CHECK-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) -; CHECK-NEXT: call void @unknown_use32(i32* nonnull dereferenceable(101) [[P]]) -; CHECK-NEXT: call void @unknown() -; CHECK-NEXT: ret void +; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@nonnull_assume_call +; NOT_CGSCC_OPM-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown() +; NOT_CGSCC_OPM-NEXT: [[P:%.*]] = call noundef nonnull dereferenceable(101) i32* @unkown_ptr() +; NOT_CGSCC_OPM-NEXT: call void @unknown_use32(i32* noundef nonnull dereferenceable(101) [[P]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; NOT_CGSCC_OPM-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG3]]), "dereferenceable"(i8* [[ARG1]], i64 1), "dereferenceable"(i8* [[ARG1]], i64 2), "dereferenceable"(i32* [[P]], i64 101), "dereferenceable_or_null"(i8* [[ARG2]], i64 31), "dereferenceable_or_null"(i8* [[ARG4]], i64 42) ] +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown_use32(i32* noundef nonnull dereferenceable(101) [[P]]) +; NOT_CGSCC_OPM-NEXT: call void @unknown() +; NOT_CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@nonnull_assume_call +; IS__CGSCC_OPM-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; IS__CGSCC_OPM-NEXT: call void @unknown() +; IS__CGSCC_OPM-NEXT: [[P:%.*]] = call nonnull dereferenceable(101) i32* @unkown_ptr() +; IS__CGSCC_OPM-NEXT: call void @unknown_use32(i32* noundef nonnull dereferenceable(101) [[P]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; IS__CGSCC_OPM-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG3]]), "dereferenceable"(i8* [[ARG1]], i64 1), "dereferenceable"(i8* [[ARG1]], i64 2), "dereferenceable"(i32* [[P]], i64 
101), "dereferenceable_or_null"(i8* [[ARG2]], i64 31), "dereferenceable_or_null"(i8* [[ARG4]], i64 42) ] +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) +; IS__CGSCC_OPM-NEXT: call void @unknown_use32(i32* noundef nonnull dereferenceable(101) [[P]]) +; IS__CGSCC_OPM-NEXT: call void @unknown() +; IS__CGSCC_OPM-NEXT: ret void ; call void @unknown() %p = call i32* @unkown_ptr() diff --git a/llvm/test/Transforms/Attributor/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll index 28c0166dd0cd62..3451fa8a59f44f 100644 --- a/llvm/test/Transforms/Attributor/heap_to_stack.ll +++ b/llvm/test/Transforms/Attributor/heap_to_stack.ll @@ -43,12 +43,19 @@ define void @nofree_arg_only(i8* %p1, i8* %p2) { ; TEST 1 - negative, pointer freed in another function. define void @test1() { -; CHECK-LABEL: define {{[^@]+}}@test1() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @nocapture_func_frees_pointer(i8* noalias nocapture [[TMP1]]) -; CHECK-NEXT: tail call void (...) @func_throws() -; CHECK-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test1() +; NOT_CGSCC_NPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: tail call void @nocapture_func_frees_pointer(i8* noalias nocapture noundef [[TMP1]]) +; NOT_CGSCC_NPM-NEXT: tail call void (...) @func_throws() +; NOT_CGSCC_NPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@test1() +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_NPM-NEXT: tail call void @nocapture_func_frees_pointer(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_NPM-NEXT: tail call void (...) @func_throws() +; IS__CGSCC_NPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_NPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @nocapture_func_frees_pointer(i8* %1) @@ -60,11 +67,17 @@ define void @test1() { ; TEST 2 - negative, call to a sync function. 
define void @test2() { -; CHECK-LABEL: define {{[^@]+}}@test2() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @sync_func(i8* [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test2() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT____-NEXT: tail call void @sync_func(i8* noundef [[TMP1]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test2() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @sync_func(i8* noundef [[TMP1]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @sync_func(i8* %1) @@ -75,16 +88,22 @@ define void @test2() { ; TEST 3 - 1 malloc, 1 free define void @test3() { -; IS________OPM-LABEL: define {{[^@]+}}@test3() -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test3() +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test3() ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 -; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) +; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test3() +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @no_sync_func(i8* %1) @@ -93,18 +112,25 @@ define void @test3() { } define void @test3a(i8* %p) { -; IS________OPM-LABEL: define {{[^@]+}}@test3a -; IS________OPM-SAME: (i8* nocapture [[P:%.*]]) -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree [[TMP1]], i8* nocapture [[P]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test3a +; IS__TUNIT_OPM-SAME: (i8* nocapture [[P:%.*]]) +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree noundef [[TMP1]], i8* nocapture [[P]]) +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test3a ; IS________NPM-SAME: (i8* nocapture [[P:%.*]]) ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 -; IS________NPM-NEXT: tail call 
void @nofree_arg_only(i8* noalias nocapture nofree [[TMP1]], i8* nocapture [[P]]) +; IS________NPM-NEXT: tail call void @nofree_arg_only(i8* noalias nocapture nofree noundef [[TMP1]], i8* nocapture [[P]]) ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test3a +; IS__CGSCC_OPM-SAME: (i8* nocapture [[P:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree noundef [[TMP1]], i8* nocapture [[P]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @nofree_arg_only(i8* %1, i8* %p) @@ -117,15 +143,15 @@ declare noalias i8* @aligned_alloc(i64, i64) define void @test3b(i8* %p) { ; IS________OPM-LABEL: define {{[^@]+}}@test3b ; IS________OPM-SAME: (i8* nocapture [[P:%.*]]) -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 32, i64 128) -; IS________OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree [[TMP1]], i8* nocapture [[P]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) +; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @aligned_alloc(i64 32, i64 128) +; IS________OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree noundef [[TMP1]], i8* nocapture [[P]]) +; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) ; IS________OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test3b ; IS________NPM-SAME: (i8* nocapture [[P:%.*]]) ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 128, align 32 -; IS________NPM-NEXT: tail call void @nofree_arg_only(i8* noalias nocapture nofree [[TMP1]], i8* nocapture [[P]]) +; IS________NPM-NEXT: tail call void @nofree_arg_only(i8* noalias nocapture nofree noundef [[TMP1]], i8* nocapture [[P]]) ; IS________NPM-NEXT: ret void ; %1 = tail call noalias i8* @aligned_alloc(i64 32, i64 128) @@ -136,11 +162,17 @@ define void @test3b(i8* %p) { ; leave alone non-constant alignments. 
define void @test3c(i64 %alignment) { -; CHECK-LABEL: define {{[^@]+}}@test3c -; CHECK-SAME: (i64 [[ALIGNMENT:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 [[ALIGNMENT]], i64 128) -; CHECK-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@test3c +; NOT_CGSCC_OPM-SAME: (i64 [[ALIGNMENT:%.*]]) +; NOT_CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @aligned_alloc(i64 [[ALIGNMENT]], i64 128) +; NOT_CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; NOT_CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test3c +; IS__CGSCC_OPM-SAME: (i64 [[ALIGNMENT:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @aligned_alloc(i64 [[ALIGNMENT]], i64 128) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @aligned_alloc(i64 %alignment, i64 128) tail call void @free(i8* %1) @@ -151,16 +183,16 @@ declare noalias i8* @calloc(i64, i64) define void @test0() { ; IS________OPM-LABEL: define {{[^@]+}}@test0() -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @calloc(i64 2, i64 4) -; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) +; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @calloc(i64 2, i64 4) +; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) ; IS________OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test0() ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 8, align 1 ; IS________NPM-NEXT: [[CALLOC_BC:%.*]] = bitcast i8* [[TMP1]] to i8* ; IS________NPM-NEXT: call void @llvm.memset.p0i8.i64(i8* [[CALLOC_BC]], i8 0, i64 8, i1 false) -; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) +; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) ; IS________NPM-NEXT: ret void ; %1 = tail call noalias i8* @calloc(i64 2, i64 4) @@ -171,15 +203,20 @@ define void @test0() { ; TEST 4 define void @test4() { -; IS________OPM-LABEL: define {{[^@]+}}@test4() -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test4() +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test4() ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 -; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP1]]) +; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP1]]) ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test4() +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @nofree_func(i8* %1) @@ -190,20 +227,20 @@ 
define void @test4() { ; are in nofree functions and are not captured define void @test5(i32, i8* %p) { -; IS________OPM-LABEL: define {{[^@]+}}@test5 -; IS________OPM-SAME: (i32 [[TMP0:%.*]], i8* nocapture [[P:%.*]]) -; IS________OPM-NEXT: [[TMP2:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 -; IS________OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] -; IS________OPM: 4: -; IS________OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP2]]) -; IS________OPM-NEXT: br label [[TMP6:%.*]] -; IS________OPM: 5: -; IS________OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree [[TMP2]], i8* nocapture [[P]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP2]]) -; IS________OPM-NEXT: br label [[TMP6]] -; IS________OPM: 6: -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test5 +; IS__TUNIT_OPM-SAME: (i32 [[TMP0:%.*]], i8* nocapture [[P:%.*]]) +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] +; IS__TUNIT_OPM: 4: +; IS__TUNIT_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) +; IS__TUNIT_OPM-NEXT: br label [[TMP6:%.*]] +; IS__TUNIT_OPM: 5: +; IS__TUNIT_OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree noundef [[TMP2]], i8* nocapture [[P]]) +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__TUNIT_OPM-NEXT: br label [[TMP6]] +; IS__TUNIT_OPM: 6: +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test5 ; IS________NPM-SAME: (i32 [[TMP0:%.*]], i8* nocapture [[P:%.*]]) @@ -211,13 +248,28 @@ define void @test5(i32, i8* %p) { ; IS________NPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 ; IS________NPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] ; IS________NPM: 4: -; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP2]]) +; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) ; IS________NPM-NEXT: br label [[TMP6:%.*]] ; IS________NPM: 5: -; IS________NPM-NEXT: tail call void @nofree_arg_only(i8* noalias nocapture nofree [[TMP2]], i8* nocapture [[P]]) +; IS________NPM-NEXT: tail call void @nofree_arg_only(i8* noalias nocapture nofree noundef [[TMP2]], i8* nocapture [[P]]) ; IS________NPM-NEXT: br label [[TMP6]] ; IS________NPM: 6: ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test5 +; IS__CGSCC_OPM-SAME: (i32 [[TMP0:%.*]], i8* nocapture [[P:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] +; IS__CGSCC_OPM: 4: +; IS__CGSCC_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) +; IS__CGSCC_OPM-NEXT: br label [[TMP6:%.*]] +; IS__CGSCC_OPM: 5: +; IS__CGSCC_OPM-NEXT: tail call void @nofree_arg_only(i8* nocapture nofree noundef [[TMP2]], i8* nocapture [[P]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__CGSCC_OPM-NEXT: br label [[TMP6]] +; IS__CGSCC_OPM: 6: +; IS__CGSCC_OPM-NEXT: ret void ; %2 = tail call noalias i8* @malloc(i64 4) %3 = icmp eq i32 %0, 0 @@ -239,20 +291,20 @@ define void @test5(i32, i8* %p) { ; TEST 6 - 
all exit paths have a call to free define void @test6(i32) { -; IS________OPM-LABEL: define {{[^@]+}}@test6 -; IS________OPM-SAME: (i32 [[TMP0:%.*]]) -; IS________OPM-NEXT: [[TMP2:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 -; IS________OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] -; IS________OPM: 4: -; IS________OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP2]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP2]]) -; IS________OPM-NEXT: br label [[TMP6:%.*]] -; IS________OPM: 5: -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture [[TMP2]]) -; IS________OPM-NEXT: br label [[TMP6]] -; IS________OPM: 6: -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test6 +; IS__TUNIT_OPM-SAME: (i32 [[TMP0:%.*]]) +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] +; IS__TUNIT_OPM: 4: +; IS__TUNIT_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__TUNIT_OPM-NEXT: br label [[TMP6:%.*]] +; IS__TUNIT_OPM: 5: +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__TUNIT_OPM-NEXT: br label [[TMP6]] +; IS__TUNIT_OPM: 6: +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test6 ; IS________NPM-SAME: (i32 [[TMP0:%.*]]) @@ -260,12 +312,27 @@ define void @test6(i32) { ; IS________NPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 ; IS________NPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] ; IS________NPM: 4: -; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree [[TMP2]]) +; IS________NPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) ; IS________NPM-NEXT: br label [[TMP6:%.*]] ; IS________NPM: 5: ; IS________NPM-NEXT: br label [[TMP6]] ; IS________NPM: 6: ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test6 +; IS__CGSCC_OPM-SAME: (i32 [[TMP0:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP0]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TMP3]], label [[TMP5:%.*]], label [[TMP4:%.*]] +; IS__CGSCC_OPM: 4: +; IS__CGSCC_OPM-NEXT: tail call void @nofree_func(i8* noalias nocapture nofree noundef [[TMP2]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__CGSCC_OPM-NEXT: br label [[TMP6:%.*]] +; IS__CGSCC_OPM: 5: +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP2]]) +; IS__CGSCC_OPM-NEXT: br label [[TMP6]] +; IS__CGSCC_OPM: 6: +; IS__CGSCC_OPM-NEXT: ret void ; %2 = tail call noalias i8* @malloc(i64 4) %3 = icmp eq i32 %0, 0 @@ -308,14 +375,23 @@ define void @test7() { ; TEST 8 - Negative: bitcast pointer used in capture function define void @test8() { -; CHECK-LABEL: define {{[^@]+}}@test8() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; CHECK-NEXT: store i32 10, i32* [[TMP2]], align 4 -; CHECK-NEXT: tail call void @foo(i32* align 4 [[TMP2]]) -; CHECK-NEXT: tail call void @free(i8* nocapture nonnull 
align 4 dereferenceable(4) [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test8() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: tail call void @foo(i32* noundef align 4 [[TMP2]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test8() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: tail call void @foo(i32* noundef align 4 [[TMP2]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @no_sync_func(i8* %1) @@ -329,14 +405,23 @@ define void @test8() { ; TEST 9 - FIXME: malloc should be converted. define void @test9() { -; CHECK-LABEL: define {{[^@]+}}@test9() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; CHECK-NEXT: store i32 10, i32* [[TMP2]], align 4 -; CHECK-NEXT: tail call void @foo_nounw(i32* nofree align 4 [[TMP2]]) -; CHECK-NEXT: tail call void @free(i8* nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test9() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: tail call void @foo_nounw(i32* nofree noundef align 4 [[TMP2]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test9() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: tail call void @foo_nounw(i32* nofree noundef align 4 [[TMP2]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @no_sync_func(i8* %1) @@ -351,22 +436,31 @@ define void @test9() { ; TEST 10 - 1 malloc, 1 free define i32 @test10() { -; IS________OPM-LABEL: define {{[^@]+}}@test10() -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; IS________OPM-NEXT: store 
i32 10, i32* [[TMP2]], align 4 -; IS________OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; IS________OPM-NEXT: ret i32 [[TMP3]] +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test10() +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP3]] ; ; IS________NPM-LABEL: define {{[^@]+}}@test10() ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 -; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) +; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) ; IS________NPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* ; IS________NPM-NEXT: store i32 10, i32* [[TMP2]], align 4 ; IS________NPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 ; IS________NPM-NEXT: ret i32 [[TMP3]] +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test10() +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP3]] ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @no_sync_func(i8* %1) @@ -378,24 +472,34 @@ define i32 @test10() { } define i32 @test_lifetime() { -; IS________OPM-LABEL: define {{[^@]+}}@test_lifetime() -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; IS________OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; IS________OPM-NEXT: store i32 10, i32* [[TMP2]], align 4 -; IS________OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; IS________OPM-NEXT: ret i32 [[TMP3]] +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test_lifetime() +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret i32 
[[TMP3]] ; ; IS________NPM-LABEL: define {{[^@]+}}@test_lifetime() ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 -; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; IS________NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS________NPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) ; IS________NPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* ; IS________NPM-NEXT: store i32 10, i32* [[TMP2]], align 4 ; IS________NPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 ; IS________NPM-NEXT: ret i32 [[TMP3]] +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test_lifetime() +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP3]] ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @no_sync_func(i8* %1) @@ -410,11 +514,17 @@ define i32 @test_lifetime() { ; TEST 11 define void @test11() { -; CHECK-LABEL: define {{[^@]+}}@test11() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @sync_will_return(i8* [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test11() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT____-NEXT: tail call void @sync_will_return(i8* noundef [[TMP1]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test11() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @sync_will_return(i8* noundef [[TMP1]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) tail call void @sync_will_return(i8* %1) @@ -424,37 +534,37 @@ define void @test11() { ; TEST 12 define i32 @irreducible_cfg(i32 %0) { -; IS________OPM-LABEL: define {{[^@]+}}@irreducible_cfg -; IS________OPM-SAME: (i32 [[TMP0:%.*]]) -; IS________OPM-NEXT: [[TMP2:%.*]] = call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i32* -; IS________OPM-NEXT: store i32 10, i32* [[TMP3]], align 4 -; IS________OPM-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP0]], 1 -; IS________OPM-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]] -; IS________OPM: 5: -; IS________OPM-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP0]], 5 -; IS________OPM-NEXT: br label [[TMP13:%.*]] -; IS________OPM: 7: -; IS________OPM-NEXT: br label [[TMP8:%.*]] -; IS________OPM: 8: -; IS________OPM-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP14:%.*]], [[TMP13]] ], [ 1, [[TMP7]] ] -; 
IS________OPM-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4 -; IS________OPM-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 -; IS________OPM-NEXT: store i32 [[TMP10]], i32* [[TMP3]], align 4 -; IS________OPM-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], 0 -; IS________OPM-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP15:%.*]] -; IS________OPM: 12: -; IS________OPM-NEXT: br label [[TMP13]] -; IS________OPM: 13: -; IS________OPM-NEXT: [[DOT1:%.*]] = phi i32 [ [[TMP6]], [[TMP5]] ], [ [[DOT0]], [[TMP12]] ] -; IS________OPM-NEXT: [[TMP14]] = add nsw i32 [[DOT1]], 1 -; IS________OPM-NEXT: br label [[TMP8]] -; IS________OPM: 15: -; IS________OPM-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP3]], align 4 -; IS________OPM-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP3]] to i8* -; IS________OPM-NEXT: call void @free(i8* nocapture [[TMP17]]) -; IS________OPM-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP3]], align 4 -; IS________OPM-NEXT: ret i32 [[TMP18]] +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@irreducible_cfg +; IS__TUNIT_OPM-SAME: (i32 [[TMP0:%.*]]) +; IS__TUNIT_OPM-NEXT: [[TMP2:%.*]] = call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i32* +; IS__TUNIT_OPM-NEXT: store i32 10, i32* [[TMP3]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP0]], 1 +; IS__TUNIT_OPM-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]] +; IS__TUNIT_OPM: 5: +; IS__TUNIT_OPM-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP0]], 5 +; IS__TUNIT_OPM-NEXT: br label [[TMP13:%.*]] +; IS__TUNIT_OPM: 7: +; IS__TUNIT_OPM-NEXT: br label [[TMP8:%.*]] +; IS__TUNIT_OPM: 8: +; IS__TUNIT_OPM-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP14:%.*]], [[TMP13]] ], [ 1, [[TMP7]] ] +; IS__TUNIT_OPM-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 +; IS__TUNIT_OPM-NEXT: store i32 [[TMP10]], i32* [[TMP3]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], 0 +; IS__TUNIT_OPM-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP15:%.*]] +; IS__TUNIT_OPM: 12: +; IS__TUNIT_OPM-NEXT: br label [[TMP13]] +; IS__TUNIT_OPM: 13: +; IS__TUNIT_OPM-NEXT: [[DOT1:%.*]] = phi i32 [ [[TMP6]], [[TMP5]] ], [ [[DOT0]], [[TMP12]] ] +; IS__TUNIT_OPM-NEXT: [[TMP14]] = add nsw i32 [[DOT1]], 1 +; IS__TUNIT_OPM-NEXT: br label [[TMP8]] +; IS__TUNIT_OPM: 15: +; IS__TUNIT_OPM-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__TUNIT_OPM-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP3]] to i8* +; IS__TUNIT_OPM-NEXT: call void @free(i8* nocapture noundef [[TMP17]]) +; IS__TUNIT_OPM-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__TUNIT_OPM-NEXT: ret i32 [[TMP18]] ; ; IS________NPM-LABEL: define {{[^@]+}}@irreducible_cfg ; IS________NPM-SAME: (i32 [[TMP0:%.*]]) @@ -485,6 +595,38 @@ define i32 @irreducible_cfg(i32 %0) { ; IS________NPM-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP3]] to i8* ; IS________NPM-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP3]], align 4 ; IS________NPM-NEXT: ret i32 [[TMP17]] +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@irreducible_cfg +; IS__CGSCC_OPM-SAME: (i32 [[TMP0:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP2:%.*]] = call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to i32* +; IS__CGSCC_OPM-NEXT: store i32 10, i32* [[TMP3]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP0]], 1 +; IS__CGSCC_OPM-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]] +; IS__CGSCC_OPM: 5: +; IS__CGSCC_OPM-NEXT: [[TMP6:%.*]] = add nsw i32 [[TMP0]], 5 +; 
IS__CGSCC_OPM-NEXT: br label [[TMP13:%.*]] +; IS__CGSCC_OPM: 7: +; IS__CGSCC_OPM-NEXT: br label [[TMP8:%.*]] +; IS__CGSCC_OPM: 8: +; IS__CGSCC_OPM-NEXT: [[DOT0:%.*]] = phi i32 [ [[TMP14:%.*]], [[TMP13]] ], [ 1, [[TMP7]] ] +; IS__CGSCC_OPM-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP9]], -1 +; IS__CGSCC_OPM-NEXT: store i32 [[TMP10]], i32* [[TMP3]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP11:%.*]] = icmp ne i32 [[TMP9]], 0 +; IS__CGSCC_OPM-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP15:%.*]] +; IS__CGSCC_OPM: 12: +; IS__CGSCC_OPM-NEXT: br label [[TMP13]] +; IS__CGSCC_OPM: 13: +; IS__CGSCC_OPM-NEXT: [[DOT1:%.*]] = phi i32 [ [[TMP6]], [[TMP5]] ], [ [[DOT0]], [[TMP12]] ] +; IS__CGSCC_OPM-NEXT: [[TMP14]] = add nsw i32 [[DOT1]], 1 +; IS__CGSCC_OPM-NEXT: br label [[TMP8]] +; IS__CGSCC_OPM: 15: +; IS__CGSCC_OPM-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__CGSCC_OPM-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP3]] to i8* +; IS__CGSCC_OPM-NEXT: call void @free(i8* nocapture noundef [[TMP17]]) +; IS__CGSCC_OPM-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP3]], align 4 +; IS__CGSCC_OPM-NEXT: ret i32 [[TMP18]] ; %2 = call noalias i8* @malloc(i64 4) %3 = bitcast i8* %2 to i32* @@ -589,14 +731,23 @@ define i32 @malloc_in_loop(i32 %0) { ; Malloc/Calloc too large define i32 @test13() { -; CHECK-LABEL: define {{[^@]+}}@test13() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 256) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; CHECK-NEXT: store i32 10, i32* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -; CHECK-NEXT: tail call void @free(i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; IS__TUNIT____-LABEL: define {{[^@]+}}@test13() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 256) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT____-NEXT: ret i32 [[TMP3]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test13() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 256) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC____-NEXT: ret i32 [[TMP3]] ; %1 = tail call noalias i8* @malloc(i64 256) tail call void @no_sync_func(i8* %1) @@ -608,14 +759,23 @@ define i32 @test13() { } define i32 @test_sle() { -; CHECK-LABEL: define {{[^@]+}}@test_sle() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 -1) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; CHECK-NEXT: store i32 10, i32* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, 
i32* [[TMP2]], align 4 -; CHECK-NEXT: tail call void @free(i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; IS__TUNIT____-LABEL: define {{[^@]+}}@test_sle() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 -1) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT____-NEXT: ret i32 [[TMP3]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test_sle() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 -1) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC____-NEXT: ret i32 [[TMP3]] ; %1 = tail call noalias i8* @malloc(i64 -1) tail call void @no_sync_func(i8* %1) @@ -627,14 +787,23 @@ define i32 @test_sle() { } define i32 @test_overflow() { -; CHECK-LABEL: define {{[^@]+}}@test_overflow() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @calloc(i64 65537, i64 65537) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* -; CHECK-NEXT: store i32 10, i32* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 -; CHECK-NEXT: tail call void @free(i8* noalias nocapture nonnull align 4 dereferenceable(4) [[TMP1]]) -; CHECK-NEXT: ret i32 [[TMP3]] +; IS__TUNIT____-LABEL: define {{[^@]+}}@test_overflow() +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @calloc(i64 65537, i64 65537) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__TUNIT____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__TUNIT____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__TUNIT____-NEXT: ret i32 [[TMP3]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test_overflow() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @calloc(i64 65537, i64 65537) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32* +; IS__CGSCC____-NEXT: store i32 10, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +; IS__CGSCC____-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull align 4 dereferenceable(4) [[TMP1]]) +; IS__CGSCC____-NEXT: ret i32 [[TMP3]] ; %1 = tail call noalias i8* @calloc(i64 65537, i64 65537) tail call void @no_sync_func(i8* %1) @@ -646,11 +815,17 @@ define i32 @test_overflow() { } define void @test14() { -; CHECK-LABEL: define {{[^@]+}}@test14() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @calloc(i64 64, i64 4) -; CHECK-NEXT: tail call 
void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@test14() +; NOT_CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @calloc(i64 64, i64 4) +; NOT_CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; NOT_CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; NOT_CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test14() +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @calloc(i64 64, i64 4) +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @calloc(i64 64, i64 4) tail call void @no_sync_func(i8* %1) @@ -659,12 +834,19 @@ define void @test14() { } define void @test15(i64 %S) { -; CHECK-LABEL: define {{[^@]+}}@test15 -; CHECK-SAME: (i64 [[S:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 [[S]]) -; CHECK-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* noalias nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test15 +; IS__TUNIT____-SAME: (i64 [[S:%.*]]) +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 [[S]]) +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test15 +; IS__CGSCC____-SAME: (i64 [[S:%.*]]) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 [[S]]) +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* noalias nocapture noundef [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 %S) tail call void @no_sync_func(i8* %1) @@ -673,20 +855,28 @@ define void @test15(i64 %S) { } define void @test16a(i8 %v, i8** %P) { -; IS________OPM-LABEL: define {{[^@]+}}@test16a -; IS________OPM-SAME: (i8 [[V:%.*]], i8** nocapture nofree readnone [[P:%.*]]) -; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: store i8 [[V]], i8* [[TMP1]], align 1 -; IS________OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree nonnull dereferenceable(1) [[TMP1]]) -; IS________OPM-NEXT: tail call void @free(i8* noalias nocapture nonnull dereferenceable(1) [[TMP1]]) -; IS________OPM-NEXT: ret void +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@test16a +; IS__TUNIT_OPM-SAME: (i8 [[V:%.*]], i8** nocapture nofree readnone [[P:%.*]]) +; IS__TUNIT_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_OPM-NEXT: store i8 [[V]], i8* [[TMP1]], align 1 +; IS__TUNIT_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef nonnull dereferenceable(1) [[TMP1]]) +; IS__TUNIT_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull dereferenceable(1) [[TMP1]]) +; IS__TUNIT_OPM-NEXT: ret void ; ; IS________NPM-LABEL: define {{[^@]+}}@test16a ; IS________NPM-SAME: (i8 [[V:%.*]], i8** nocapture nofree readnone [[P:%.*]]) ; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1 ; 
IS________NPM-NEXT: store i8 [[V]], i8* [[TMP1]], align 1 -; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree nonnull dereferenceable(1) [[TMP1]]) +; IS________NPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef nonnull dereferenceable(1) [[TMP1]]) ; IS________NPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test16a +; IS__CGSCC_OPM-SAME: (i8 [[V:%.*]], i8** nocapture nofree readnone [[P:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: store i8 [[V]], i8* [[TMP1]], align 1 +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* noalias nocapture nofree noundef nonnull dereferenceable(1) [[TMP1]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* noalias nocapture noundef nonnull dereferenceable(1) [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) store i8 %v, i8* %1 @@ -696,13 +886,21 @@ define void @test16a(i8 %v, i8** %P) { } define void @test16b(i8 %v, i8** %P) { -; CHECK-LABEL: define {{[^@]+}}@test16b -; CHECK-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 -; CHECK-NEXT: tail call void @no_sync_func(i8* nocapture nofree [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; IS__TUNIT____-LABEL: define {{[^@]+}}@test16b +; IS__TUNIT____-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT____-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 +; IS__TUNIT____-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[TMP1]]) +; IS__TUNIT____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__TUNIT____-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test16b +; IS__CGSCC____-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 +; IS__CGSCC____-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[TMP1]]) +; IS__CGSCC____-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__CGSCC____-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) store i8* %1, i8** %P @@ -712,13 +910,21 @@ define void @test16b(i8 %v, i8** %P) { } define void @test16c(i8 %v, i8** %P) { -; CHECK-LABEL: define {{[^@]+}}@test16c -; CHECK-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 -; CHECK-NEXT: tail call void @no_sync_func(i8* nocapture nofree [[TMP1]]) -; CHECK-NEXT: tail call void @free(i8* nocapture [[TMP1]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@test16c +; NOT_CGSCC_OPM-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) +; NOT_CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_OPM-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 +; NOT_CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[TMP1]]) +; NOT_CGSCC_OPM-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; NOT_CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test16c +; IS__CGSCC_OPM-SAME: (i8 [[V:%.*]], i8** nocapture writeonly [[P:%.*]]) +; IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = tail call 
noalias i8* @malloc(i64 4) +; IS__CGSCC_OPM-NEXT: store i8* [[TMP1]], i8** [[P]], align 8 +; IS__CGSCC_OPM-NEXT: tail call void @no_sync_func(i8* nocapture nofree noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: tail call void @free(i8* nocapture noundef [[TMP1]]) +; IS__CGSCC_OPM-NEXT: ret void ; %1 = tail call noalias i8* @malloc(i64 4) store i8* %1, i8** %P diff --git a/llvm/test/Transforms/Attributor/internal-noalias.ll b/llvm/test/Transforms/Attributor/internal-noalias.ll index d4e27d24bc5535..b71c07bd402093 100644 --- a/llvm/test/Transforms/Attributor/internal-noalias.ll +++ b/llvm/test/Transforms/Attributor/internal-noalias.ll @@ -96,8 +96,8 @@ define dso_local i32 @visible_local(i32* %A) #0 { ; IS__TUNIT____-NEXT: entry: ; IS__TUNIT____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__TUNIT____-NEXT: store i32 5, i32* [[B]], align 4 -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @noalias_args(i32* nocapture nofree readonly align 4 [[A]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) -; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32 @noalias_args_argmem(i32* nocapture nofree readonly align 4 [[A]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32 @noalias_args(i32* nocapture nofree readonly align 4 [[A]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32 @noalias_args_argmem(i32* nocapture nofree readonly align 4 [[A]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__TUNIT____-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL1]], [[CALL2]] ; IS__TUNIT____-NEXT: ret i32 [[ADD]] ; @@ -107,8 +107,8 @@ define dso_local i32 @visible_local(i32* %A) #0 { ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 5, i32* [[B]], align 4 -; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32 @noalias_args(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) -; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32 @noalias_args_argmem(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32 @noalias_args(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32 @noalias_args_argmem(i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC____-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL1]], [[CALL2]] ; IS__CGSCC____-NEXT: ret i32 [[ADD]] ; @@ -124,7 +124,7 @@ entry: define internal i32 @noalias_args_argmem_ro(i32* %A, i32* %B) #1 { ; IS__TUNIT_OPM: Function Attrs: argmemonly nofree noinline nosync nounwind readonly uwtable willreturn ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@noalias_args_argmem_ro -; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__TUNIT_OPM-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* noalias nocapture nofree 
noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__TUNIT_OPM-NEXT: [[T0:%.*]] = load i32, i32* [[A]], align 4 ; IS__TUNIT_OPM-NEXT: [[T1:%.*]] = load i32, i32* [[B]], align 4 ; IS__TUNIT_OPM-NEXT: [[ADD:%.*]] = add nsw i32 [[T0]], [[T1]] @@ -144,7 +144,7 @@ define internal i32 @noalias_args_argmem_ro(i32* %A, i32* %B) #1 { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree noinline norecurse nosync nounwind readonly uwtable willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@noalias_args_argmem_ro -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[A:%.*]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B:%.*]]) ; IS__CGSCC____-NEXT: [[T0:%.*]] = load i32, i32* [[A]], align 4 ; IS__CGSCC____-NEXT: [[T1:%.*]] = load i32, i32* [[B]], align 4 ; IS__CGSCC____-NEXT: [[ADD:%.*]] = add nsw i32 [[T0]], [[T1]] @@ -161,7 +161,7 @@ define i32 @visible_local_2() { ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@visible_local_2() ; IS__TUNIT_OPM-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__TUNIT_OPM-NEXT: store i32 5, i32* [[B]], align 4 -; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_ro(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_ro(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__TUNIT_OPM-NEXT: ret i32 [[CALL]] ; ; IS__TUNIT_NPM: Function Attrs: nofree nosync nounwind readnone willreturn @@ -177,7 +177,7 @@ define i32 @visible_local_2() { ; IS__CGSCC____-LABEL: define {{[^@]+}}@visible_local_2() ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 5, i32* [[B]], align 4 -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_ro(i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]], i32* noalias nocapture nofree nonnull readonly align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_ro(i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]], i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[B]]) ; IS__CGSCC____-NEXT: ret i32 [[CALL]] ; %B = alloca i32, align 4 @@ -189,14 +189,14 @@ define i32 @visible_local_2() { define internal i32 @noalias_args_argmem_rn(i32* %A, i32* %B) #1 { ; IS__TUNIT____: Function Attrs: argmemonly nofree noinline nosync nounwind uwtable willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@noalias_args_argmem_rn -; IS__TUNIT____-SAME: (i32* noalias nocapture nofree nonnull align 4 dereferenceable(4) [[B:%.*]]) +; IS__TUNIT____-SAME: (i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) ; IS__TUNIT____-NEXT: [[T0:%.*]] = load i32, i32* [[B]], align 4 ; IS__TUNIT____-NEXT: store i32 0, i32* [[B]], align 4 ; IS__TUNIT____-NEXT: ret i32 [[T0]] ; ; IS__CGSCC____: Function Attrs: argmemonly nofree noinline norecurse nosync nounwind uwtable willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@noalias_args_argmem_rn -; IS__CGSCC____-SAME: (i32* nocapture nofree nonnull align 4 
dereferenceable(4) [[B:%.*]]) +; IS__CGSCC____-SAME: (i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B:%.*]]) ; IS__CGSCC____-NEXT: [[T0:%.*]] = load i32, i32* [[B]], align 4 ; IS__CGSCC____-NEXT: store i32 0, i32* [[B]], align 4 ; IS__CGSCC____-NEXT: ret i32 [[T0]] @@ -211,14 +211,14 @@ define i32 @visible_local_3() { ; IS__TUNIT____-LABEL: define {{[^@]+}}@visible_local_3() ; IS__TUNIT____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__TUNIT____-NEXT: store i32 5, i32* [[B]], align 4 -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_rn(i32* noalias nocapture nofree nonnull align 4 dereferenceable(4) [[B]]) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_rn(i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B]]) ; IS__TUNIT____-NEXT: ret i32 [[CALL]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@visible_local_3() ; IS__CGSCC____-NEXT: [[B:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: store i32 5, i32* [[B]], align 4 -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_rn(i32* noalias nocapture nofree nonnull align 4 dereferenceable(4) [[B]]) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32 @noalias_args_argmem_rn(i32* noalias nocapture nofree noundef nonnull align 4 dereferenceable(4) [[B]]) ; IS__CGSCC____-NEXT: ret i32 [[CALL]] ; %B = alloca i32, align 4 diff --git a/llvm/test/Transforms/Attributor/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll index f3bd7ef1460a85..62d195a1f66bbf 100644 --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -1758,12 +1758,12 @@ define void @call_via_pointer_with_dead_args(i32* %a, i32* %b, void (i32*, i32*, ; FIXME: We have to prevent the propagation of %fp in the new pm CGSCC pass until the CallGraphUpdater can handle the new call edge. 
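; [Editorial aside, not part of the original patch] The hunks above and below
; all follow one pattern: whenever the Attributor can prove a call-site
; argument is a well-defined value (neither undef nor poison), e.g. a fresh
; alloca or a known function pointer, it now manifests `noundef` next to the
; attributes it already deduced (`nonnull`, `align`, `dereferenceable`, ...).
; A minimal sketch of that shape, with hypothetical names (@sink and @caller
; are assumptions for illustration, not taken from these tests):
;
;   define void @caller() {
;     %slot = alloca i32, align 128   ; fresh alloca: defined and nonnull
;     call void @sink(i32* %slot)     ; expected to manifest roughly as:
;     ret void                        ;   @sink(i32* noundef nonnull align 128
;   }                                 ;         dereferenceable(4) %slot)
;   declare void @sink(i32*)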
define internal void @call_via_pointer_with_dead_args_internal_a(i32* %a, i32* %b, void (i32*, i32*, i32*, i64, i32**)* %fp) { ; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_a -; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull [[FP:%.*]]) +; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) ; NOT_CGSCC_NPM-NEXT: call void @called_via_pointer(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null) ; NOT_CGSCC_NPM-NEXT: ret void ; ; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_a -; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull [[FP:%.*]]) +; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) ; IS__CGSCC____-NEXT: call void [[FP]](i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null) ; IS__CGSCC____-NEXT: ret void ; @@ -1772,12 +1772,12 @@ define internal void @call_via_pointer_with_dead_args_internal_a(i32* %a, i32* % } define internal void @call_via_pointer_with_dead_args_internal_b(i32* %a, i32* %b, void (i32*, i32*, i32*, i64, i32**)* %fp) { ; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_b -; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull [[FP:%.*]]) +; NOT_CGSCC_NPM-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) ; NOT_CGSCC_NPM-NEXT: call void @called_via_pointer_internal_2(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null) ; NOT_CGSCC_NPM-NEXT: ret void ; ; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_internal_b -; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull [[FP:%.*]]) +; IS__CGSCC____-SAME: (i32* [[A:%.*]], i32* noundef nonnull align 128 dereferenceable(4) [[B:%.*]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull [[FP:%.*]]) ; IS__CGSCC____-NEXT: call void [[FP]](i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[B]], i32* [[A]], i64 -1, i32** null) ; IS__CGSCC____-NEXT: ret void ; @@ -1791,10 +1791,10 @@ define void @call_via_pointer_with_dead_args_caller(i32* %a, i32* %b) { ; NOT_CGSCC_NPM-NEXT: [[PTR2:%.*]] = alloca i32, align 128 ; NOT_CGSCC_NPM-NEXT: [[PTR3:%.*]] = alloca i32, align 128 ; NOT_CGSCC_NPM-NEXT: [[PTR4:%.*]] = alloca i32, align 128 -; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer) -; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer_internal_1) -; NOT_CGSCC_NPM-NEXT: call void 
@call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer) -; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree @called_via_pointer_internal_2) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef @called_via_pointer) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef @called_via_pointer_internal_1) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef @called_via_pointer) +; NOT_CGSCC_NPM-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef @called_via_pointer_internal_2) ; NOT_CGSCC_NPM-NEXT: ret void ; ; IS__CGSCC____-LABEL: define {{[^@]+}}@call_via_pointer_with_dead_args_caller @@ -1803,10 +1803,10 @@ define void @call_via_pointer_with_dead_args_caller(i32* %a, i32* %b) { ; IS__CGSCC____-NEXT: [[PTR2:%.*]] = alloca i32, align 128 ; IS__CGSCC____-NEXT: [[PTR3:%.*]] = alloca i32, align 128 ; IS__CGSCC____-NEXT: [[PTR4:%.*]] = alloca i32, align 128 -; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) -; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_1) -; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer) -; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree nonnull @called_via_pointer_internal_2) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR1]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args(i32* [[A]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR2]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer_internal_1) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_a(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR3]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull @called_via_pointer) +; IS__CGSCC____-NEXT: call void @call_via_pointer_with_dead_args_internal_b(i32* [[B]], i32* noundef nonnull align 128 dereferenceable(4) [[PTR4]], void (i32*, i32*, i32*, i64, i32**)* nocapture nofree noundef nonnull 
@called_via_pointer_internal_2) ; IS__CGSCC____-NEXT: ret void ; %ptr1 = alloca i32, align 128 @@ -1997,7 +1997,7 @@ define void @bad_gep() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N:%.*]] = alloca i8, align 1 ; CHECK-NEXT: [[M:%.*]] = alloca i8, align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* noalias nocapture nonnull dereferenceable(1) [[N]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* noalias nocapture noundef nonnull dereferenceable(1) [[N]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: while.body: ; CHECK-NEXT: unreachable @@ -2006,7 +2006,7 @@ define void @bad_gep() { ; CHECK: if.end: ; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* noalias nocapture nonnull dereferenceable(1) [[N]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* noalias nocapture noundef nonnull dereferenceable(1) [[N]]) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/memory_locations.ll b/llvm/test/Transforms/Attributor/memory_locations.ll index 23bff129101805..aeb66e9dbe4a20 100644 --- a/llvm/test/Transforms/Attributor/memory_locations.ll +++ b/llvm/test/Transforms/Attributor/memory_locations.ll @@ -327,7 +327,7 @@ define void @callerB1() { ; CHECK: Function Attrs: readnone ; CHECK-LABEL: define {{[^@]+}}@callerB1() ; CHECK-NEXT: [[STACK:%.*]] = alloca i8, align 1 -; CHECK-NEXT: [[TMP1:%.*]] = call i8* @argmem_only(i8* nonnull dereferenceable(1) [[STACK]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @argmem_only(i8* noundef nonnull dereferenceable(1) [[STACK]]) ; CHECK-NEXT: ret void ; %stack = alloca i8 @@ -338,7 +338,7 @@ define void @callerB2() { ; CHECK: Function Attrs: inaccessiblememonly ; CHECK-LABEL: define {{[^@]+}}@callerB2() ; CHECK-NEXT: [[STACK:%.*]] = alloca i8, align 1 -; CHECK-NEXT: [[TMP1:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* nonnull dereferenceable(1) [[STACK]]) +; CHECK-NEXT: [[TMP1:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* noundef nonnull dereferenceable(1) [[STACK]]) ; CHECK-NEXT: ret void ; %stack = alloca i8 @@ -346,20 +346,30 @@ define void @callerB2() { ret void } define void @callerC1() { -; CHECK-LABEL: define {{[^@]+}}@callerC1() -; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @unknown_ptr() -; CHECK-NEXT: [[TMP1:%.*]] = call i8* @argmem_only(i8* [[UNKNOWN]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@callerC1() +; NOT_CGSCC_NPM-NEXT: [[UNKNOWN:%.*]] = call noundef i8* @unknown_ptr() +; NOT_CGSCC_NPM-NEXT: [[TMP1:%.*]] = call i8* @argmem_only(i8* noundef [[UNKNOWN]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@callerC1() +; IS__CGSCC_NPM-NEXT: [[UNKNOWN:%.*]] = call i8* @unknown_ptr() +; IS__CGSCC_NPM-NEXT: [[TMP1:%.*]] = call i8* @argmem_only(i8* noundef [[UNKNOWN]]) +; IS__CGSCC_NPM-NEXT: ret void ; %unknown = call i8* @unknown_ptr() call i8* @argmem_only(i8* %unknown) ret void } define void @callerC2() { -; CHECK-LABEL: define {{[^@]+}}@callerC2() -; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @unknown_ptr() -; CHECK-NEXT: [[TMP1:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* [[UNKNOWN]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_OPM-LABEL: define {{[^@]+}}@callerC2() +; NOT_CGSCC_OPM-NEXT: [[UNKNOWN:%.*]] = call noundef i8* @unknown_ptr() +; NOT_CGSCC_OPM-NEXT: [[TMP1:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* noundef [[UNKNOWN]]) +; NOT_CGSCC_OPM-NEXT: ret void +; +; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@callerC2() +; IS__CGSCC_OPM-NEXT: [[UNKNOWN:%.*]] = call i8* @unknown_ptr() +; 
IS__CGSCC_OPM-NEXT: [[TMP1:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* noundef [[UNKNOWN]]) +; IS__CGSCC_OPM-NEXT: ret void ; %unknown = call i8* @unknown_ptr() call i8* @inaccesible_argmem_only_decl(i8* %unknown) @@ -367,7 +377,7 @@ define void @callerC2() { } define void @callerD1() { ; CHECK-LABEL: define {{[^@]+}}@callerD1() -; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @argmem_only(i8* noalias nocapture align 536870912 null) +; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @argmem_only(i8* noalias nocapture noundef align 536870912 null) ; CHECK-NEXT: store i8 0, i8* [[UNKNOWN]], align 1 ; CHECK-NEXT: ret void ; @@ -377,7 +387,7 @@ define void @callerD1() { } define void @callerD2() { ; CHECK-LABEL: define {{[^@]+}}@callerD2() -; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* noalias nocapture align 536870912 null) +; CHECK-NEXT: [[UNKNOWN:%.*]] = call i8* @inaccesible_argmem_only_decl(i8* noalias nocapture noundef align 536870912 null) ; CHECK-NEXT: store i8 0, i8* [[UNKNOWN]], align 1 ; CHECK-NEXT: ret void ; @@ -464,12 +474,12 @@ define void @writeonly_global() { define void @writeonly_global_via_arg() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind willreturn writeonly ; IS__TUNIT____-LABEL: define {{[^@]+}}@writeonly_global_via_arg() -; IS__TUNIT____-NEXT: call void @write_global_via_arg(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) @G) +; IS__TUNIT____-NEXT: call void @write_global_via_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) @G) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC____-LABEL: define {{[^@]+}}@writeonly_global_via_arg() -; IS__CGSCC____-NEXT: call void @write_global_via_arg(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) @G) +; IS__CGSCC____-NEXT: call void @write_global_via_arg(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) @G) ; IS__CGSCC____-NEXT: ret void ; call void @write_global_via_arg(i32* @G) @@ -499,7 +509,7 @@ define i8 @recursive_not_readnone(i8* %ptr, i1 %c) { ; CHECK-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: t: -; CHECK-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone(i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) ; CHECK-NEXT: [[R:%.*]] = load i8, i8* [[ALLOC]], align 1 ; CHECK-NEXT: ret i8 [[R]] ; CHECK: f: @@ -520,11 +530,11 @@ f: define internal i8 @recursive_not_readnone_internal(i8* %ptr, i1 %c) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind ; IS__TUNIT____-LABEL: define {{[^@]+}}@recursive_not_readnone_internal -; IS__TUNIT____-SAME: (i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[PTR:%.*]], i1 [[C:%.*]]) +; IS__TUNIT____-SAME: (i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[PTR:%.*]], i1 [[C:%.*]]) ; IS__TUNIT____-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__TUNIT____: t: -; IS__TUNIT____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias 
nocapture nofree noundef nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) ; IS__TUNIT____-NEXT: [[R:%.*]] = load i8, i8* [[ALLOC]], align 1 ; IS__TUNIT____-NEXT: ret i8 [[R]] ; IS__TUNIT____: f: @@ -533,11 +543,11 @@ define internal i8 @recursive_not_readnone_internal(i8* %ptr, i1 %c) { ; ; IS__CGSCC____: Function Attrs: argmemonly nofree nosync nounwind ; IS__CGSCC____-LABEL: define {{[^@]+}}@recursive_not_readnone_internal -; IS__CGSCC____-SAME: (i8* nocapture nofree nonnull writeonly dereferenceable(1) [[PTR:%.*]], i1 [[C:%.*]]) +; IS__CGSCC____-SAME: (i8* nocapture nofree noundef nonnull writeonly dereferenceable(1) [[PTR:%.*]], i1 [[C:%.*]]) ; IS__CGSCC____-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__CGSCC____: t: -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) ; IS__CGSCC____-NEXT: [[R:%.*]] = load i8, i8* [[ALLOC]], align 1 ; IS__CGSCC____-NEXT: ret i8 [[R]] ; IS__CGSCC____: f: @@ -560,7 +570,7 @@ define i8 @readnone_caller(i1 %c) { ; CHECK-LABEL: define {{[^@]+}}@readnone_caller ; CHECK-SAME: (i1 [[C:%.*]]) ; CHECK-NEXT: [[A:%.*]] = alloca i8, align 1 -; CHECK-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[A]], i1 [[C]]) +; CHECK-NEXT: [[R:%.*]] = call i8 @recursive_not_readnone_internal(i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[A]], i1 [[C]]) ; CHECK-NEXT: ret i8 [[R]] ; %a = alloca i8 @@ -575,7 +585,7 @@ define internal i8 @recursive_not_readnone_internal2(i8* %ptr, i1 %c) { ; IS__TUNIT____-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__TUNIT____: t: -; IS__TUNIT____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal2(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) +; IS__TUNIT____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal2(i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) ; IS__TUNIT____-NEXT: [[R:%.*]] = load i8, i8* [[ALLOC]], align 1 ; IS__TUNIT____-NEXT: ret i8 [[R]] ; IS__TUNIT____: f: @@ -588,7 +598,7 @@ define internal i8 @recursive_not_readnone_internal2(i8* %ptr, i1 %c) { ; IS__CGSCC____-NEXT: [[ALLOC:%.*]] = alloca i8, align 1 ; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__CGSCC____: t: -; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal2(i8* noalias nocapture nofree nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = call i8 @recursive_not_readnone_internal2(i8* noalias nocapture nofree noundef nonnull writeonly dereferenceable(1) [[ALLOC]], i1 false) ; IS__CGSCC____-NEXT: [[R:%.*]] = load i8, i8* [[ALLOC]], align 1 ; IS__CGSCC____-NEXT: ret i8 [[R]] ; IS__CGSCC____: f: diff --git a/llvm/test/Transforms/Attributor/misc.ll b/llvm/test/Transforms/Attributor/misc.ll index 80a6948ca6dc4c..3ab1f8543aeb26 100644 --- a/llvm/test/Transforms/Attributor/misc.ll +++ b/llvm/test/Transforms/Attributor/misc.ll @@ -13,10 +13,10 @@ define internal void @internal(void (i8*)* %fp) { ; CHECK-SAME: (void (i8*)* nonnull [[FP:%.*]]) ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) ; CHECK-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) -; CHECK-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; CHECK-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) +; CHECK-NEXT: call void @callback1(void (i32*)* noundef nonnull @foo) +; CHECK-NEXT: call void @callback2(void (i8*)* noundef bitcast (void (i32*)* @foo to void (i8*)*)) ; CHECK-NEXT: call void @callback2(void (i8*)* nonnull [[FP]]) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void [[FP]](i8* [[TMP1]]) @@ -42,9 +42,9 @@ define void @external(void (i8*)* %fp) { ; CHECK-SAME: (void (i8*)* [[FP:%.*]]) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A:%.*]] = alloca i32, align 4 -; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void @callback1(void (i32*)* nonnull @foo) -; CHECK-NEXT: call void @callback2(void (i8*)* bitcast (void (i32*)* @foo to void (i8*)*)) +; CHECK-NEXT: call void @foo(i32* noalias nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void @callback1(void (i32*)* noundef nonnull @foo) +; CHECK-NEXT: call void @callback2(void (i8*)* noundef bitcast (void (i32*)* @foo to void (i8*)*)) ; CHECK-NEXT: call void @callback2(void (i8*)* [[FP]]) ; CHECK-NEXT: call void [[FP]](i8* bitcast (void (i32*)* @foo to i8*)) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* diff --git a/llvm/test/Transforms/Attributor/misc_crash.ll b/llvm/test/Transforms/Attributor/misc_crash.ll index b49cbd94322d16..e420f58af13683 100644 --- a/llvm/test/Transforms/Attributor/misc_crash.ll +++ b/llvm/test/Transforms/Attributor/misc_crash.ll @@ -29,7 +29,7 @@ define i32* @func1() { } ; UTC_ARGS: --disable -; CHECK-LABEL: define internal nonnull align 4 dereferenceable(4) i32* @func1a() +; CHECK-LABEL: define internal noundef nonnull align 4 dereferenceable(4) i32* @func1a() ; CHECK-NEXT: ret i32* getelementptr inbounds ([1 x i32], [1 x i32]* @var1, i32 0, i32 0) define internal i32* @func1a([1 x i32]* %arg) { %ptr = getelementptr inbounds [1 x i32], [1 x i32]* %arg, i64 0, i64 0 @@ -40,7 +40,7 @@ define internal i32* @func1a([1 x i32]* %arg) { define internal void @func2a(i32* %0) { ; CHECK: Function Attrs: nofree nosync nounwind willreturn writeonly ; CHECK-LABEL: define {{[^@]+}}@func2a -; CHECK-SAME: (i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[TMP0:%.*]]) +; CHECK-SAME: (i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[TMP0:%.*]]) ; CHECK-NEXT: store i32 0, i32* @var2, align 4 ; CHECK-NEXT: ret void ; @@ -50,7 +50,7 @@ define internal void @func2a(i32* %0) { define i32 @func2() { ; CHECK-LABEL: define {{[^@]+}}@func2() -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 (i32*, ...) bitcast (void (i32*)* @func2a to i32 (i32*, ...)*)(i32* nonnull align 4 dereferenceable(4) @var2) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 (i32*, ...) 
bitcast (void (i32*)* @func2a to i32 (i32*, ...)*)(i32* noundef nonnull align 4 dereferenceable(4) @var2) ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @var2, align 4 ; CHECK-NEXT: ret i32 [[TMP2]] ; @@ -62,7 +62,7 @@ define i32 @func2() { define i32 @func3(i1 %false) { ; CHECK-LABEL: define {{[^@]+}}@func3 ; CHECK-SAME: (i1 [[FALSE:%.*]]) -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 (i32*, ...) bitcast (void (i32*)* @func2a to i32 (i32*, ...)*)(i32* nonnull align 4 dereferenceable(4) @var2) +; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 (i32*, ...) bitcast (void (i32*)* @func2a to i32 (i32*, ...)*)(i32* noundef nonnull align 4 dereferenceable(4) @var2) ; CHECK-NEXT: br i1 [[FALSE]], label [[USE_BB:%.*]], label [[RET_BB:%.*]] ; CHECK: use_bb: ; CHECK-NEXT: ret i32 [[TMP1]] diff --git a/llvm/test/Transforms/Attributor/noalias.ll b/llvm/test/Transforms/Attributor/noalias.ll index 5314ea53f5a940..ff780b6714259b 100644 --- a/llvm/test/Transforms/Attributor/noalias.ll +++ b/llvm/test/Transforms/Attributor/noalias.ll @@ -56,9 +56,13 @@ define void @nocapture(i8* %a){ } define i8* @return_noalias_looks_like_capture(){ -; CHECK-LABEL: define {{[^@]+}}@return_noalias_looks_like_capture() -; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: ret i8* [[TMP1]] +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@return_noalias_looks_like_capture() +; NOT_CGSCC_NPM-NEXT: [[TMP1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: ret i8* [[TMP1]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@return_noalias_looks_like_capture() +; IS__CGSCC____-NEXT: [[TMP1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: ret i8* [[TMP1]] ; %1 = tail call noalias i8* @malloc(i64 4) call void @nocapture(i8* %1) @@ -180,7 +184,7 @@ define i8* @test6() nounwind uwtable ssp { ; CHECK-NEXT: store i8 97, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8], [2 x i8]* [[X]], i64 0, i64 1 ; CHECK-NEXT: store i8 0, i8* [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CALL:%.*]] = call noalias i8* @strdup(i8* nocapture nonnull dereferenceable(2) [[ARRAYIDX]]) +; CHECK-NEXT: [[CALL:%.*]] = call noalias i8* @strdup(i8* nocapture noundef nonnull dereferenceable(2) [[ARRAYIDX]]) ; CHECK-NEXT: ret i8* [[CALL]] ; %x = alloca [2 x i8], align 1 @@ -254,7 +258,7 @@ define i8* @test8(i32* %0) nounwind uwtable { declare void @use_i8(i8* nocapture) define internal void @test9a(i8* %a, i8* %b) { ; CHECK-LABEL: define {{[^@]+}}@test9a() -; CHECK-NEXT: call void @use_i8(i8* noalias nocapture align 536870912 null) +; CHECK-NEXT: call void @use_i8(i8* noalias nocapture noundef align 536870912 null) ; CHECK-NEXT: ret void ; call void @use_i8(i8* null) @@ -353,14 +357,23 @@ define void @test11(i8* noalias %a) { declare void @use_nocapture(i8* nocapture) declare void @use(i8*) define void @test12_1() { -; CHECK-LABEL: define {{[^@]+}}@test12_1() -; CHECK-NEXT: [[A:%.*]] = alloca i8, align 4 -; CHECK-NEXT: [[B:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture nonnull align 4 dereferenceable(1) [[A]]) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture nonnull align 4 dereferenceable(1) [[A]]) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture [[B]]) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture [[B]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test12_1() +; NOT_CGSCC_NPM-NEXT: [[A:%.*]] = alloca i8, align 4 +; NOT_CGSCC_NPM-NEXT: 
[[B:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef nonnull align 4 dereferenceable(1) [[A]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef nonnull align 4 dereferenceable(1) [[A]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[B]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[B]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test12_1() +; IS__CGSCC____-NEXT: [[A:%.*]] = alloca i8, align 4 +; IS__CGSCC____-NEXT: [[B:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef nonnull align 4 dereferenceable(1) [[A]]) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef nonnull align 4 dereferenceable(1) [[A]]) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[B]]) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[B]]) +; IS__CGSCC____-NEXT: ret void ; %A = alloca i8, align 4 %B = tail call noalias i8* @malloc(i64 4) @@ -372,13 +385,21 @@ define void @test12_1() { } define void @test12_2(){ -; CHECK-LABEL: define {{[^@]+}}@test12_2() -; CHECK-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture [[A]]) -; CHECK-NEXT: tail call void @use_nocapture(i8* noalias nocapture [[A]]) -; CHECK-NEXT: tail call void @use(i8* [[A]]) -; CHECK-NEXT: tail call void @use_nocapture(i8* nocapture [[A]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test12_2() +; NOT_CGSCC_NPM-NEXT: [[A:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[A]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[A]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use(i8* noundef [[A]]) +; NOT_CGSCC_NPM-NEXT: tail call void @use_nocapture(i8* nocapture noundef [[A]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test12_2() +; IS__CGSCC____-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[A]]) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* noalias nocapture noundef [[A]]) +; IS__CGSCC____-NEXT: tail call void @use(i8* noundef [[A]]) +; IS__CGSCC____-NEXT: tail call void @use_nocapture(i8* nocapture noundef [[A]]) +; IS__CGSCC____-NEXT: ret void ; ; FIXME: This should be @use_nocapture(i8* noalias [[A]]) ; FIXME: This should be @use_nocapture(i8* noalias nocapture [[A]]) @@ -392,10 +413,15 @@ define void @test12_2(){ declare void @two_args(i8* nocapture , i8* nocapture) define void @test12_3(){ -; CHECK-LABEL: define {{[^@]+}}@test12_3() -; CHECK-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test12_3() +; NOT_CGSCC_NPM-NEXT: [[A:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test12_3() +; IS__CGSCC____-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) +; 
IS__CGSCC____-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A]]) +; IS__CGSCC____-NEXT: ret void ; %A = tail call noalias i8* @malloc(i64 4) tail call void @two_args(i8* %A, i8* %A) @@ -404,28 +430,40 @@ define void @test12_3(){ define void @test12_4(){ ; IS________OPM-LABEL: define {{[^@]+}}@test12_4() -; IS________OPM-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) -; IS________OPM-NEXT: [[B:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS________OPM-NEXT: [[A:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS________OPM-NEXT: [[B:%.*]] = tail call noalias noundef i8* @malloc(i64 4) ; IS________OPM-NEXT: [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0 ; IS________OPM-NEXT: [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 ; IS________OPM-NEXT: [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0 -; IS________OPM-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[B]]) -; IS________OPM-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_0]]) -; IS________OPM-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_1]]) -; IS________OPM-NEXT: tail call void @two_args(i8* nocapture [[A_0]], i8* nocapture [[B_0]]) +; IS________OPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[B]]) +; IS________OPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A_0]]) +; IS________OPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture [[A_1]]) +; IS________OPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A_0]], i8* nocapture noundef [[B_0]]) ; IS________OPM-NEXT: ret void ; -; NOT_TUNIT_OPM-LABEL: define {{[^@]+}}@test12_4() -; NOT_TUNIT_OPM-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) -; NOT_TUNIT_OPM-NEXT: [[B:%.*]] = tail call noalias i8* @malloc(i64 4) -; NOT_TUNIT_OPM-NEXT: [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0 -; NOT_TUNIT_OPM-NEXT: [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 -; NOT_TUNIT_OPM-NEXT: [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0 -; NOT_TUNIT_OPM-NEXT: tail call void @two_args(i8* noalias nocapture [[A]], i8* noalias nocapture [[B]]) -; NOT_TUNIT_OPM-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_0]]) -; NOT_TUNIT_OPM-NEXT: tail call void @two_args(i8* nocapture [[A]], i8* nocapture [[A_1]]) -; NOT_TUNIT_OPM-NEXT: tail call void @two_args(i8* nocapture [[A_0]], i8* nocapture [[B_0]]) -; NOT_TUNIT_OPM-NEXT: ret void +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@test12_4() +; IS__TUNIT_NPM-NEXT: [[A:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_NPM-NEXT: [[B:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; IS__TUNIT_NPM-NEXT: [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0 +; IS__TUNIT_NPM-NEXT: [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 +; IS__TUNIT_NPM-NEXT: [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0 +; IS__TUNIT_NPM-NEXT: tail call void @two_args(i8* noalias nocapture noundef [[A]], i8* noalias nocapture noundef [[B]]) +; IS__TUNIT_NPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A_0]]) +; IS__TUNIT_NPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture [[A_1]]) +; IS__TUNIT_NPM-NEXT: tail call void @two_args(i8* nocapture noundef [[A_0]], i8* nocapture noundef [[B_0]]) +; IS__TUNIT_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test12_4() +; IS__CGSCC____-NEXT: [[A:%.*]] = tail call noalias i8* @malloc(i64 4) +; 
IS__CGSCC____-NEXT: [[B:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: [[A_0:%.*]] = getelementptr i8, i8* [[A]], i64 0 +; IS__CGSCC____-NEXT: [[A_1:%.*]] = getelementptr i8, i8* [[A]], i64 1 +; IS__CGSCC____-NEXT: [[B_0:%.*]] = getelementptr i8, i8* [[B]], i64 0 +; IS__CGSCC____-NEXT: tail call void @two_args(i8* noalias nocapture noundef [[A]], i8* noalias nocapture noundef [[B]]) +; IS__CGSCC____-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A_0]]) +; IS__CGSCC____-NEXT: tail call void @two_args(i8* nocapture noundef [[A]], i8* nocapture noundef [[A_1]]) +; IS__CGSCC____-NEXT: tail call void @two_args(i8* nocapture noundef [[A_0]], i8* nocapture noundef [[B_0]]) +; IS__CGSCC____-NEXT: ret void ; %A = tail call noalias i8* @malloc(i64 4) %B = tail call noalias i8* @malloc(i64 4) @@ -456,12 +494,19 @@ define void @use_i8_internal(i8* %a) { } define void @test13_use_noalias(){ -; CHECK-LABEL: define {{[^@]+}}@test13_use_noalias() -; CHECK-NEXT: [[M1:%.*]] = tail call noalias i8* @malloc(i64 4) -; CHECK-NEXT: [[C1:%.*]] = bitcast i8* [[M1]] to i16* -; CHECK-NEXT: [[C2:%.*]] = bitcast i16* [[C1]] to i8* -; CHECK-NEXT: call void @use_i8_internal(i8* noalias nocapture [[C2]]) -; CHECK-NEXT: ret void +; NOT_CGSCC_NPM-LABEL: define {{[^@]+}}@test13_use_noalias() +; NOT_CGSCC_NPM-NEXT: [[M1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) +; NOT_CGSCC_NPM-NEXT: [[C1:%.*]] = bitcast i8* [[M1]] to i16* +; NOT_CGSCC_NPM-NEXT: [[C2:%.*]] = bitcast i16* [[C1]] to i8* +; NOT_CGSCC_NPM-NEXT: call void @use_i8_internal(i8* noalias nocapture noundef [[C2]]) +; NOT_CGSCC_NPM-NEXT: ret void +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test13_use_noalias() +; IS__CGSCC____-NEXT: [[M1:%.*]] = tail call noalias i8* @malloc(i64 4) +; IS__CGSCC____-NEXT: [[C1:%.*]] = bitcast i8* [[M1]] to i16* +; IS__CGSCC____-NEXT: [[C2:%.*]] = bitcast i16* [[C1]] to i8* +; IS__CGSCC____-NEXT: call void @use_i8_internal(i8* noalias nocapture noundef [[C2]]) +; IS__CGSCC____-NEXT: ret void ; ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test13_use_noalias() ; IS__CGSCC_OPM-NEXT: [[M1:%.*]] = tail call noalias i8* @malloc(i64 4) @@ -478,12 +523,12 @@ define void @test13_use_noalias(){ define void @test13_use_alias(){ ; CHECK-LABEL: define {{[^@]+}}@test13_use_alias() -; CHECK-NEXT: [[M1:%.*]] = tail call noalias i8* @malloc(i64 4) +; CHECK-NEXT: [[M1:%.*]] = tail call noalias noundef i8* @malloc(i64 4) ; CHECK-NEXT: [[C1:%.*]] = bitcast i8* [[M1]] to i16* ; CHECK-NEXT: [[C2A:%.*]] = bitcast i16* [[C1]] to i8* ; CHECK-NEXT: [[C2B:%.*]] = bitcast i16* [[C1]] to i8* -; CHECK-NEXT: call void @use_i8_internal(i8* nocapture [[C2A]]) -; CHECK-NEXT: call void @use_i8_internal(i8* nocapture [[C2B]]) +; CHECK-NEXT: call void @use_i8_internal(i8* nocapture noundef [[C2A]]) +; CHECK-NEXT: call void @use_i8_internal(i8* nocapture noundef [[C2B]]) ; CHECK-NEXT: ret void ; %m1 = tail call noalias i8* @malloc(i64 4) @@ -570,11 +615,11 @@ define internal fastcc double @strtox(i8* %s, i8** %p, i32 %prec) unnamed_addr { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[F:%.*]] = alloca [[STRUCT__IO_FILE:%.*]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct._IO_FILE* [[F]] to i8* -; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 144, i8* nocapture nonnull align 8 dereferenceable(240) [[TMP0]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) ; CHECK-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @sh_fromstring to 
i32 (%struct._IO_FILE*, i8*)*)(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i8* [[S]]) -; CHECK-NEXT: call void @__shlim(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i64 0) -; CHECK-NEXT: [[CALL1:%.*]] = call double @__floatscan(%struct._IO_FILE* nonnull align 8 dereferenceable(240) [[F]], i32 1, i32 1) -; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 144, i8* nocapture nonnull align 8 dereferenceable(240) [[TMP0]]) +; CHECK-NEXT: call void @__shlim(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i64 0) +; CHECK-NEXT: [[CALL1:%.*]] = call double @__floatscan(%struct._IO_FILE* noundef nonnull align 8 dereferenceable(240) [[F]], i32 1, i32 1) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 144, i8* nocapture noundef nonnull align 8 dereferenceable(240) [[TMP0]]) ; CHECK-NEXT: ret double [[CALL1]] ; entry: diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll index 58d8be6d60c9b5..4ea6a327db6e84 100644 --- a/llvm/test/Transforms/Attributor/nocapture-1.ll +++ b/llvm/test/Transforms/Attributor/nocapture-1.ll @@ -459,7 +459,7 @@ define i8* @test4_2(i8* %x4_2, i8* %y4_2, i8* %z4_2, i1 %c) { ; CHECK-SAME: (i8* nocapture nofree readnone [[X4_2:%.*]], i8* nofree readnone returned "no-capture-maybe-returned" [[Y4_2:%.*]], i8* nocapture nofree readnone [[Z4_2:%.*]], i1 [[C:%.*]]) ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; CHECK: t: -; CHECK-NEXT: call void @test4_1(i8* noalias nocapture nofree readnone align 536870912 null, i1 [[C]]) +; CHECK-NEXT: call void @test4_1(i8* noalias nocapture nofree noundef readnone align 536870912 null, i1 [[C]]) ; CHECK-NEXT: store i32* null, i32** @g, align 8 ; CHECK-NEXT: br label [[F]] ; CHECK: f: @@ -759,7 +759,7 @@ declare void @unknown(i8*) define void @test_callsite() { ; CHECK-LABEL: define {{[^@]+}}@test_callsite() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @unknown(i8* noalias nocapture align 536870912 null) +; CHECK-NEXT: call void @unknown(i8* noalias nocapture noundef align 536870912 null) ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/Attributor/nocapture-2.ll b/llvm/test/Transforms/Attributor/nocapture-2.ll index 5ed73b74c8693b..c4bc297ee2ad76 100644 --- a/llvm/test/Transforms/Attributor/nocapture-2.ll +++ b/llvm/test/Transforms/Attributor/nocapture-2.ll @@ -217,11 +217,11 @@ define float* @scc_A(i32* dereferenceable_or_null(4) %a) { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to i16* -; CHECK-NEXT: [[CALL:%.*]] = call dereferenceable_or_null(4) i8* @scc_C(i16* noalias nofree nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[TMP0]]) +; CHECK-NEXT: [[CALL:%.*]] = call noundef dereferenceable_or_null(4) i8* @scc_C(i16* noalias nofree nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[CALL]] to double* -; CHECK-NEXT: [[CALL1:%.*]] = call dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP1]]) +; CHECK-NEXT: [[CALL1:%.*]] = call noundef dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree noundef nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP1]]) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[CALL1]] to i32* -; CHECK-NEXT: [[CALL2:%.*]] = call float* @scc_A(i32* noalias nofree nonnull readnone 
dereferenceable(8) "no-capture-maybe-returned" [[TMP2]]) +; CHECK-NEXT: [[CALL2:%.*]] = call float* @scc_A(i32* noalias nofree noundef nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[CALL2]] to i32* ; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: cond.false: @@ -263,11 +263,11 @@ define i64* @scc_B(double* dereferenceable_or_null(8) %a) { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to i32* -; CHECK-NEXT: [[CALL:%.*]] = call dereferenceable_or_null(4) float* @scc_A(i32* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP0]]) +; CHECK-NEXT: [[CALL:%.*]] = call noundef dereferenceable_or_null(4) float* @scc_A(i32* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[CALL]] to double* -; CHECK-NEXT: [[CALL1:%.*]] = call dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP1]]) +; CHECK-NEXT: [[CALL1:%.*]] = call noundef dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree noundef nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP1]]) ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[CALL1]] to i16* -; CHECK-NEXT: [[CALL2:%.*]] = call i8* @scc_C(i16* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP2]]) +; CHECK-NEXT: [[CALL2:%.*]] = call i8* @scc_C(i16* noalias nofree noundef nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP2]]) ; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: cond.false: ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[A]] to i8* @@ -312,16 +312,16 @@ define i8* @scc_C(i16* dereferenceable_or_null(2) %a) { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] ; CHECK: cond.true: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[A]] to double* -; CHECK-NEXT: [[CALL1:%.*]] = call dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP0]]) +; CHECK-NEXT: [[CALL1:%.*]] = call noundef dereferenceable_or_null(8) i64* @scc_B(double* noalias nofree nonnull readnone dereferenceable(8) "no-capture-maybe-returned" [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[CALL1]] to i8* ; CHECK-NEXT: br label [[COND_END:%.*]] ; CHECK: cond.false: -; CHECK-NEXT: [[CALL2:%.*]] = call dereferenceable_or_null(4) i8* @scc_C(i16* noalias nofree nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[A]]) +; CHECK-NEXT: [[CALL2:%.*]] = call noundef dereferenceable_or_null(4) i8* @scc_C(i16* noalias nofree nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[A]]) ; CHECK-NEXT: br label [[COND_END]] ; CHECK: cond.end: ; CHECK-NEXT: [[COND:%.*]] = phi i8* [ [[TMP1]], [[COND_TRUE]] ], [ [[CALL2]], [[COND_FALSE]] ] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[COND]] to i32* -; CHECK-NEXT: [[CALL3:%.*]] = call float* @scc_A(i32* noalias nofree nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[TMP2]]) +; CHECK-NEXT: [[CALL3:%.*]] = call float* @scc_A(i32* noalias nofree noundef nonnull readnone dereferenceable(4) "no-capture-maybe-returned" [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[CALL3]] to i8* ; CHECK-NEXT: ret i8* [[TMP3]] ; diff --git a/llvm/test/Transforms/Attributor/nonnull.ll 
b/llvm/test/Transforms/Attributor/nonnull.ll index 4add5a5c1f5a89..6e06b3b195204f 100644 --- a/llvm/test/Transforms/Attributor/nonnull.ll +++ b/llvm/test/Transforms/Attributor/nonnull.ll @@ -364,10 +364,10 @@ define void @test12(i8* nonnull %a) { declare i8* @unknown() define void @test13_helper() { ; CHECK-LABEL: define {{[^@]+}}@test13_helper() -; CHECK-NEXT: [[NONNULLPTR:%.*]] = tail call nonnull i8* @ret_nonnull() -; CHECK-NEXT: [[MAYBENULLPTR:%.*]] = tail call i8* @unknown() -; CHECK-NEXT: tail call void @test13(i8* noalias nocapture nofree nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree readnone [[MAYBENULLPTR]]) -; CHECK-NEXT: tail call void @test13(i8* noalias nocapture nofree nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree readnone [[MAYBENULLPTR]], i8* noalias nocapture nofree nonnull readnone [[NONNULLPTR]]) +; CHECK-NEXT: [[NONNULLPTR:%.*]] = tail call noundef nonnull i8* @ret_nonnull() +; CHECK-NEXT: [[MAYBENULLPTR:%.*]] = tail call noundef i8* @unknown() +; CHECK-NEXT: tail call void @test13(i8* noalias nocapture nofree noundef nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree noundef nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree noundef readnone [[MAYBENULLPTR]]) +; CHECK-NEXT: tail call void @test13(i8* noalias nocapture nofree noundef nonnull readnone [[NONNULLPTR]], i8* noalias nocapture nofree noundef readnone [[MAYBENULLPTR]], i8* noalias nocapture nofree noundef nonnull readnone [[NONNULLPTR]]) ; CHECK-NEXT: ret void ; %nonnullptr = tail call i8* @ret_nonnull() @@ -379,10 +379,10 @@ define void @test13_helper() { define internal void @test13(i8* %a, i8* %b, i8* %c) { ; IS__TUNIT____: Function Attrs: nounwind ; IS__TUNIT____-LABEL: define {{[^@]+}}@test13 -; IS__TUNIT____-SAME: (i8* noalias nocapture nofree nonnull readnone [[A:%.*]], i8* noalias nocapture nofree readnone [[B:%.*]], i8* noalias nocapture nofree readnone [[C:%.*]]) -; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree nonnull readnone [[A]]) -; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree readnone [[B]]) -; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree readnone [[C]]) +; IS__TUNIT____-SAME: (i8* noalias nocapture nofree noundef nonnull readnone [[A:%.*]], i8* noalias nocapture nofree noundef readnone [[B:%.*]], i8* noalias nocapture nofree noundef readnone [[C:%.*]]) +; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree noundef nonnull readnone [[A]]) +; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree noundef readnone [[B]]) +; IS__TUNIT____-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree noundef readnone [[C]]) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: nounwind diff --git a/llvm/test/Transforms/Attributor/noreturn_async.ll b/llvm/test/Transforms/Attributor/noreturn_async.ll index 879fb16a13d0bf..6c3526baba2bb7 100644 --- a/llvm/test/Transforms/Attributor/noreturn_async.ll +++ b/llvm/test/Transforms/Attributor/noreturn_async.ll @@ -86,7 +86,7 @@ entry: ; CHECK-NOT: nounwind ; CHECK-NEXT: define ; CHECK-NEXT: entry: -; CHECK-NEXT: %call3 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(18) getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0)) +; CHECK-NEXT: %call3 = call i32 (i8*, ...) 
@printf(i8* noundef nonnull dereferenceable(18) getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0))
 ; CHECK-NEXT:   call void @"?overflow@@YAXXZ_may_throw"()
 ; CHECK-NEXT:   unreachable
   %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0))
diff --git a/llvm/test/Transforms/Attributor/noreturn_sync.ll b/llvm/test/Transforms/Attributor/noreturn_sync.ll
index 22b675427cf01f..0321b0ceafd76e 100644
--- a/llvm/test/Transforms/Attributor/noreturn_sync.ll
+++ b/llvm/test/Transforms/Attributor/noreturn_sync.ll
@@ -82,7 +82,7 @@ entry:
 ; CHECK-NOT: nounwind
 ; CHECK-NEXT: define
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   %call3 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(18) getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0))
+; CHECK-NEXT:   %call3 = call i32 (i8*, ...) @printf(i8* noundef nonnull dereferenceable(18) getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0))
 ; CHECK-NEXT:   call void @"?overflow@@YAXXZ_may_throw"()
 ; CHECK-NEXT:   unreachable
   %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @"??_C@_0BC@NKPAGFFJ@Exception?5caught?6?$AA@", i64 0, i64 0))
diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll
index 1404cc4b34c23a..102b2e86ac2132 100644
--- a/llvm/test/Transforms/Attributor/nosync.ll
+++ b/llvm/test/Transforms/Attributor/nosync.ll
@@ -459,7 +459,7 @@ declare void @llvm.x86.sse2.clflush(i8*)
 define void @i_totally_sync() {
 ; CHECK: Function Attrs: nounwind
 ; CHECK-LABEL: define {{[^@]+}}@i_totally_sync()
-; CHECK-NEXT:    tail call void @llvm.x86.sse2.clflush(i8* nonnull align 4 dereferenceable(4) bitcast (i32* @a to i8*))
+; CHECK-NEXT:    tail call void @llvm.x86.sse2.clflush(i8* noundef nonnull align 4 dereferenceable(4) bitcast (i32* @a to i8*))
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.x86.sse2.clflush(i8* bitcast (i32* @a to i8*))
diff --git a/llvm/test/Transforms/Attributor/noundef.ll b/llvm/test/Transforms/Attributor/noundef.ll
new file mode 100644
index 00000000000000..b7c1d45205a607
--- /dev/null
+++ b/llvm/test/Transforms/Attributor/noundef.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -attributor -enable-new-pm=0 -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_NPM,NOT_CGSCC_OPM,NOT_TUNIT_NPM,IS__TUNIT____,IS________OPM,IS__TUNIT_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=3 -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_CGSCC_OPM,NOT_CGSCC_NPM,NOT_TUNIT_OPM,IS__TUNIT____,IS________NPM,IS__TUNIT_NPM
+; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
+; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
+
+declare void @unknown()
+
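+; [Editorial aside, not part of the upstream file] @foo below is the minimal
+; positive case for this deduction: %x is a fresh alloca, so the pointer
+; handed to @bar is known to be neither undef nor poison, and the check line
+; expects `noundef` to be manifested together with `nonnull`, `align 4`, and
+; `dereferenceable(4)` on the call-site argument. The intervening call to
+; @unknown() checks that an opaque call does not invalidate the deduction.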
+declare void @bar(i32*) + +define void @foo() { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[X:%.*]] = alloca i32, align 4 +; CHECK-NEXT: call void @unknown() +; CHECK-NEXT: call void @bar(i32* noundef nonnull align 4 dereferenceable(4) [[X]]) +; CHECK-NEXT: ret void +; + %x = alloca i32 + call void @unknown() + call void @bar(i32* %x) + ret void +} diff --git a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll index 7c114500328003..701b70926aaa2d 100644 --- a/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll +++ b/llvm/test/Transforms/Attributor/read_write_returned_arguments_scc.ll @@ -78,11 +78,11 @@ define internal i32* @internal_ret0_nw(i32* %n0, i32* %w0) { ; IS__TUNIT____-NEXT: store i32 3, i32* [[R0]], align 4 ; IS__TUNIT____-NEXT: store i32 5, i32* [[R1]], align 4 ; IS__TUNIT____-NEXT: store i32 1, i32* [[W0]], align 4 -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32* @internal_ret1_rrw(i32* nofree nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[R0]], i32* nofree nonnull writeonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[W0]]) -; IS__TUNIT____-NEXT: [[CALL4:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[R1]], i32* nofree nonnull writeonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[W0]]) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32* @internal_ret1_rrw(i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__TUNIT____-NEXT: [[CALL2:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__TUNIT____-NEXT: [[CALL3:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[R0]], i32* nofree nonnull writeonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[W0]]) +; IS__TUNIT____-NEXT: [[CALL4:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[R1]], i32* nofree nonnull writeonly align 4 dereferenceable(4) "no-capture-maybe-returned" [[W0]]) ; IS__TUNIT____-NEXT: [[CALL5:%.*]] = call i32* @internal_ret0_nw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) ; IS__TUNIT____-NEXT: br label [[RETURN]] ; IS__TUNIT____: return: @@ -103,11 +103,11 @@ define internal i32* 
@internal_ret0_nw(i32* %n0, i32* %w0) { ; IS__CGSCC____-NEXT: store i32 3, i32* [[R0]], align 4 ; IS__CGSCC____-NEXT: store i32 5, i32* [[R1]], align 4 ; IS__CGSCC____-NEXT: store i32 1, i32* [[W0]], align 4 -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @internal_ret1_rrw(i32* nofree nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) -; IS__CGSCC____-NEXT: [[CALL3:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[R0]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[W0]]) -; IS__CGSCC____-NEXT: [[CALL4:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree nonnull readonly align 4 dereferenceable(4) [[R1]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[W0]]) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @internal_ret1_rrw(i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__CGSCC____-NEXT: [[CALL2:%.*]] = call i32* @external_ret2_nrw(i32* nofree [[N0]], i32* nofree noundef nonnull align 4 dereferenceable(4) [[R1]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) +; IS__CGSCC____-NEXT: [[CALL3:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[R0]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[W0]]) +; IS__CGSCC____-NEXT: [[CALL4:%.*]] = call i32* @external_sink_ret2_nrw(i32* nofree [[N0]], i32* nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[R1]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[W0]]) ; IS__CGSCC____-NEXT: [[CALL5:%.*]] = call i32* @internal_ret0_nw(i32* nofree [[N0]], i32* nofree nonnull align 4 dereferenceable(4) [[W0]]) ; IS__CGSCC____-NEXT: br label [[RETURN]] ; IS__CGSCC____: return: diff --git a/llvm/test/Transforms/Attributor/readattrs.ll b/llvm/test/Transforms/Attributor/readattrs.ll index 2f2c18d293ba0f..37381026ab354a 100644 --- a/llvm/test/Transforms/Attributor/readattrs.ll +++ b/llvm/test/Transforms/Attributor/readattrs.ll @@ -241,7 +241,7 @@ define void @unsound_readnone(i8* %ignored, i8* %escaped_then_written) { ; CHECK-LABEL: define {{[^@]+}}@unsound_readnone ; CHECK-SAME: (i8* nocapture nofree readnone [[IGNORED:%.*]], i8* [[ESCAPED_THEN_WRITTEN:%.*]]) ; CHECK-NEXT: [[ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: call void @escape_readnone_ptr(i8** nonnull align 8 dereferenceable(8) [[ADDR]], i8* noalias readnone [[ESCAPED_THEN_WRITTEN]]) +; CHECK-NEXT: call void @escape_readnone_ptr(i8** noundef nonnull align 8 dereferenceable(8) [[ADDR]], i8* noalias readnone [[ESCAPED_THEN_WRITTEN]]) ; CHECK-NEXT: [[ADDR_LD:%.*]] = load i8*, i8** [[ADDR]], align 8 ; 
CHECK-NEXT: store i8 0, i8* [[ADDR_LD]], align 1 ; CHECK-NEXT: ret void @@ -257,7 +257,7 @@ define void @unsound_readonly(i8* %ignored, i8* %escaped_then_written) { ; CHECK-LABEL: define {{[^@]+}}@unsound_readonly ; CHECK-SAME: (i8* nocapture nofree readnone [[IGNORED:%.*]], i8* [[ESCAPED_THEN_WRITTEN:%.*]]) ; CHECK-NEXT: [[ADDR:%.*]] = alloca i8*, align 8 -; CHECK-NEXT: call void @escape_readonly_ptr(i8** nonnull align 8 dereferenceable(8) [[ADDR]], i8* readonly [[ESCAPED_THEN_WRITTEN]]) +; CHECK-NEXT: call void @escape_readonly_ptr(i8** noundef nonnull align 8 dereferenceable(8) [[ADDR]], i8* readonly [[ESCAPED_THEN_WRITTEN]]) ; CHECK-NEXT: [[ADDR_LD:%.*]] = load i8*, i8** [[ADDR]], align 8 ; CHECK-NEXT: store i8 0, i8* [[ADDR_LD]], align 1 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/returned.ll b/llvm/test/Transforms/Attributor/returned.ll index b0007746592de5..2df7eebc6c0c59 100644 --- a/llvm/test/Transforms/Attributor/returned.ll +++ b/llvm/test/Transforms/Attributor/returned.ll @@ -314,8 +314,8 @@ define double* @ptr_scc_r1(double* %a, double* %r, double* %b) #0 { ; IS__TUNIT____-LABEL: define {{[^@]+}}@ptr_scc_r1 ; IS__TUNIT____-SAME: (double* nofree readnone [[A:%.*]], double* nofree readnone returned [[R:%.*]], double* nocapture nofree readnone [[B:%.*]]) ; IS__TUNIT____-NEXT: entry: -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call double* @ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[R]]) -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL]]) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call noundef double* @ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[R]]) +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL]]) ; IS__TUNIT____-NEXT: ret double* [[CALL1]] ; ; IS__CGSCC____: Function Attrs: nofree noinline nosync nounwind readnone uwtable @@ -323,7 +323,7 @@ define double* @ptr_scc_r1(double* %a, double* %r, double* %b) #0 { ; IS__CGSCC____-SAME: (double* nofree readnone [[A:%.*]], double* nofree readnone returned [[R:%.*]], double* nocapture nofree readnone [[B:%.*]]) ; IS__CGSCC____-NEXT: entry: ; IS__CGSCC____-NEXT: [[CALL:%.*]] = call double* @ptr_sink_r0(double* noalias nofree readnone [[R]]) -; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL]]) +; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL]]) ; IS__CGSCC____-NEXT: ret double* [[CALL1]] ; entry: @@ -340,20 +340,20 @@ define double* @ptr_scc_r2(double* %a, double* %b, double* %r) #0 { ; IS__TUNIT____-NEXT: [[CMP:%.*]] = icmp ugt double* [[A]], [[B]] ; IS__TUNIT____-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; IS__TUNIT____: if.then: -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call double* @ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[R]]) -; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[B]], double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL]]) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call noundef double* 
@ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[R]]) +; IS__TUNIT____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[B]], double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL]]) ; IS__TUNIT____-NEXT: br label [[RETURN:%.*]] ; IS__TUNIT____: if.end: ; IS__TUNIT____-NEXT: [[CMP2:%.*]] = icmp ult double* [[A]], [[B]] ; IS__TUNIT____-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END12:%.*]] ; IS__TUNIT____: if.then3: -; IS__TUNIT____-NEXT: [[CALL4:%.*]] = call double* @ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[B]]) -; IS__TUNIT____-NEXT: [[CALL5:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nocapture nofree readnone undef) -; IS__TUNIT____-NEXT: [[CALL6:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]]) -; IS__TUNIT____-NEXT: [[CALL7:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL6]], double* noalias nocapture nofree readnone undef) -; IS__TUNIT____-NEXT: [[CALL8:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nofree readnone [[R]]) -; IS__TUNIT____-NEXT: [[CALL9:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[CALL5]], double* noalias nofree readnone [[CALL7]], double* noalias nofree readnone [[CALL8]]) -; IS__TUNIT____-NEXT: [[CALL11:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[CALL4]], double* noalias nofree readnone [[CALL9]], double* noalias nocapture nofree readnone undef) +; IS__TUNIT____-NEXT: [[CALL4:%.*]] = call noundef double* @ptr_sink_r0(double* noalias nofree readnone "no-capture-maybe-returned" [[B]]) +; IS__TUNIT____-NEXT: [[CALL5:%.*]] = call noundef double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nocapture nofree readnone undef) +; IS__TUNIT____-NEXT: [[CALL6:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]]) +; IS__TUNIT____-NEXT: [[CALL7:%.*]] = call noundef double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL6]], double* noalias nocapture nofree readnone undef) +; IS__TUNIT____-NEXT: [[CALL8:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nofree readnone [[R]]) +; IS__TUNIT____-NEXT: [[CALL9:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree noundef readnone [[CALL5]], double* noalias nofree noundef readnone [[CALL7]], double* noalias nofree noundef readnone [[CALL8]]) +; IS__TUNIT____-NEXT: [[CALL11:%.*]] = call double* @ptr_scc_r1(double* noalias nofree noundef readnone [[CALL4]], double* noalias nofree noundef readnone [[CALL9]], double* noalias nocapture nofree noundef readnone undef) ; IS__TUNIT____-NEXT: br label [[RETURN]] ; IS__TUNIT____: if.end12: ; IS__TUNIT____-NEXT: [[CMP13:%.*]] = icmp eq double* [[A]], [[B]] @@ -378,19 +378,19 @@ define double* @ptr_scc_r2(double* %a, double* %b, double* %r) #0 { ; IS__CGSCC____-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; IS__CGSCC____: if.then: ; IS__CGSCC____-NEXT: [[CALL:%.*]] = call double* 
@ptr_sink_r0(double* noalias nofree readnone [[R]]) -; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[B]], double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL]]) +; IS__CGSCC____-NEXT: [[CALL1:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[B]], double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL]]) ; IS__CGSCC____-NEXT: br label [[RETURN:%.*]] ; IS__CGSCC____: if.end: ; IS__CGSCC____-NEXT: [[CMP2:%.*]] = icmp ult double* [[A]], [[B]] ; IS__CGSCC____-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END12:%.*]] ; IS__CGSCC____: if.then3: ; IS__CGSCC____-NEXT: [[CALL4:%.*]] = call double* @ptr_sink_r0(double* noalias nofree readnone [[B]]) -; IS__CGSCC____-NEXT: [[CALL5:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nocapture nofree readnone undef) -; IS__CGSCC____-NEXT: [[CALL6:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]]) -; IS__CGSCC____-NEXT: [[CALL7:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[CALL6]], double* noalias nocapture nofree readnone undef) -; IS__CGSCC____-NEXT: [[CALL8:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nofree readnone [[R]]) -; IS__CGSCC____-NEXT: [[CALL9:%.*]] = call double* @ptr_scc_r2(double* noalias nofree readnone [[CALL5]], double* noalias nofree readnone [[CALL7]], double* noalias nofree readnone [[CALL8]]) -; IS__CGSCC____-NEXT: [[CALL11:%.*]] = call double* @ptr_scc_r1(double* noalias nofree readnone [[CALL4]], double* noalias nofree readnone [[CALL9]], double* noalias nocapture nofree readnone undef) +; IS__CGSCC____-NEXT: [[CALL5:%.*]] = call noundef double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nocapture nofree readnone undef) +; IS__CGSCC____-NEXT: [[CALL6:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]], double* noalias nofree readnone [[R]]) +; IS__CGSCC____-NEXT: [[CALL7:%.*]] = call noundef double* @ptr_scc_r1(double* noalias nofree readnone [[A]], double* noalias nofree noundef readnone [[CALL6]], double* noalias nocapture nofree readnone undef) +; IS__CGSCC____-NEXT: [[CALL8:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree readnone [[A]], double* noalias nofree readnone [[B]], double* noalias nofree readnone [[R]]) +; IS__CGSCC____-NEXT: [[CALL9:%.*]] = call noundef double* @ptr_scc_r2(double* noalias nofree noundef readnone [[CALL5]], double* noalias nofree noundef readnone [[CALL7]], double* noalias nofree noundef readnone [[CALL8]]) +; IS__CGSCC____-NEXT: [[CALL11:%.*]] = call double* @ptr_scc_r1(double* noalias nofree noundef readnone [[CALL4]], double* noalias nofree noundef readnone [[CALL9]], double* noalias nocapture nofree noundef readnone undef) ; IS__CGSCC____-NEXT: br label [[RETURN]] ; IS__CGSCC____: if.end12: ; IS__CGSCC____-NEXT: [[CMP13:%.*]] = icmp eq double* [[A]], [[B]] @@ -605,7 +605,7 @@ define i32* @calls_unknown_fn(i32* %r) #0 { ; CHECK: Function Attrs: noinline nounwind uwtable ; CHECK-LABEL: define {{[^@]+}}@calls_unknown_fn ; CHECK-SAME: (i32* nofree readnone returned "no-capture-maybe-returned" [[R:%.*]]) -; 
CHECK-NEXT: tail call void @unknown_fn(i32* (i32*)* nonnull @calls_unknown_fn) +; CHECK-NEXT: tail call void @unknown_fn(i32* (i32*)* noundef nonnull @calls_unknown_fn) ; CHECK-NEXT: ret i32* [[R]] ; tail call void @unknown_fn(i32* (i32*)* nonnull @calls_unknown_fn) diff --git a/llvm/test/Transforms/Attributor/undefined_behavior.ll b/llvm/test/Transforms/Attributor/undefined_behavior.ll index 22c2979e23defe..b4a02671b7cdc3 100644 --- a/llvm/test/Transforms/Attributor/undefined_behavior.ll +++ b/llvm/test/Transforms/Attributor/undefined_behavior.ll @@ -704,12 +704,12 @@ ret: define void @arg_nonnull_violation1_1() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@arg_nonnull_violation1_1() -; IS__TUNIT____-NEXT: call void @arg_nonnull_1(i32* noalias nocapture nofree nonnull writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_1(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@arg_nonnull_violation1_1() -; IS__CGSCC____-NEXT: call void @arg_nonnull_1(i32* noalias nocapture nofree nonnull writeonly align 536870912 dereferenceable(4) null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_1(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 dereferenceable(4) null) ; IS__CGSCC____-NEXT: ret void ; call void @arg_nonnull_1(i32* null) @@ -734,13 +734,13 @@ define void @arg_nonnull_violation2_1(i1 %c) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@arg_nonnull_violation2_1 ; IS__TUNIT____-SAME: (i1 [[C:%.*]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_1(i32* nocapture nofree nonnull writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_1(i32* nocapture nofree noundef nonnull writeonly align 536870912 null) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@arg_nonnull_violation2_1 ; IS__CGSCC____-SAME: (i1 [[C:%.*]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_1(i32* nocapture nofree nonnull writeonly align 536870912 dereferenceable(4) null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_1(i32* nocapture nofree noundef nonnull writeonly align 536870912 dereferenceable(4) null) ; IS__CGSCC____-NEXT: ret void ; %null = getelementptr i32, i32* null, i32 0 @@ -774,16 +774,16 @@ define void @arg_nonnull_violation3_1(i1 %c) { ; IS__TUNIT____-NEXT: [[PTR:%.*]] = alloca i32, align 4 ; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__TUNIT____: t: -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nofree nonnull writeonly 
align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__TUNIT____-NEXT: br label [[RET:%.*]] ; IS__TUNIT____: f: -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef nonnull writeonly 
align 536870912 null, i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__TUNIT____-NEXT: br label [[RET]] ; IS__TUNIT____: ret: ; IS__TUNIT____-NEXT: ret void @@ -794,16 +794,16 @@ define void @arg_nonnull_violation3_1(i1 %c) { ; IS__CGSCC____-NEXT: [[PTR:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__CGSCC____: t: -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__CGSCC____-NEXT: br label [[RET:%.*]] ; IS__CGSCC____: f: -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* 
noalias nocapture nofree nonnull writeonly align 536870912 null, i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__CGSCC____-NEXT: br label [[RET]] ; IS__CGSCC____: ret: ; IS__CGSCC____-NEXT: ret void @@ -833,12 +833,12 @@ define void @arg_nonnull_violation3_2(i1 %c) { ; IS__TUNIT____-NEXT: [[PTR:%.*]] = alloca i32, align 4 ; IS__TUNIT____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__TUNIT____: t: -; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__TUNIT____-NEXT: unreachable ; IS__TUNIT____: f: -; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly 
align 4 dereferenceable(4) [[PTR]]) +; IS__TUNIT____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__TUNIT____-NEXT: unreachable ; IS__TUNIT____: ret: ; IS__TUNIT____-NEXT: ret void @@ -849,12 +849,12 @@ define void @arg_nonnull_violation3_2(i1 %c) { ; IS__CGSCC____-NEXT: [[PTR:%.*]] = alloca i32, align 4 ; IS__CGSCC____-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; IS__CGSCC____: t: -; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__CGSCC____-NEXT: unreachable ; IS__CGSCC____: f: -; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]]) -; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree nonnull writeonly align 536870912 null, i32* nocapture nofree nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]]) +; IS__CGSCC____-NEXT: call void @arg_nonnull_12_noundef_2(i32* noalias nocapture nofree noundef nonnull writeonly align 536870912 null, i32* nocapture nofree noundef nonnull writeonly align 4 dereferenceable(4) [[PTR]], i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__CGSCC____-NEXT: unreachable ; IS__CGSCC____: ret: ; IS__CGSCC____-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll index 3a487a9ed22995..7ae8cd37801171 100644 --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -321,12 +321,12 @@ define i32 @ipccp3() { define internal i32* @test_inalloca(i32* inalloca %a) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_inalloca -; 
IS__TUNIT____-SAME: (i32* inalloca noalias nofree returned writeonly align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__TUNIT____-SAME: (i32* inalloca noalias nofree noundef returned writeonly align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__TUNIT____-NEXT: ret i32* [[A]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@test_inalloca -; IS__CGSCC____-SAME: (i32* inalloca noalias nofree returned writeonly align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__CGSCC____-SAME: (i32* inalloca noalias nofree noundef returned writeonly align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__CGSCC____-NEXT: ret i32* [[A]] ; ret i32* %a @@ -334,12 +334,12 @@ define internal i32* @test_inalloca(i32* inalloca %a) { define i32* @complicated_args_inalloca() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@complicated_args_inalloca() -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32* @test_inalloca(i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call i32* @test_inalloca(i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__TUNIT____-NEXT: ret i32* [[CALL]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@complicated_args_inalloca() -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @test_inalloca(i32* noalias nocapture nofree writeonly align 536870912 null) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @test_inalloca(i32* noalias nocapture nofree noundef writeonly align 536870912 null) ; IS__CGSCC____-NEXT: ret i32* [[CALL]] ; %call = call i32* @test_inalloca(i32* null) @@ -349,12 +349,12 @@ define i32* @complicated_args_inalloca() { define internal i32* @test_preallocated(i32* preallocated(i32) %a) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_preallocated -; IS__TUNIT____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__TUNIT____-SAME: (i32* noalias nofree noundef returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__TUNIT____-NEXT: ret i32* [[A]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@test_preallocated -; IS__CGSCC____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__CGSCC____-SAME: (i32* noalias nofree noundef returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__CGSCC____-NEXT: ret i32* [[A]] ; ret i32* %a @@ -363,25 +363,25 @@ define i32* @complicated_args_preallocated() { ; IS__TUNIT_OPM: Function Attrs: nounwind ; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() ; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) -; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) [[ATTR5:#.*]] [ "preallocated"(token [[C]]) ] +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree noundef writeonly preallocated(i32) align 536870912 null) [[ATTR5:#.*]] [ "preallocated"(token [[C]]) ] ; 
IS__TUNIT_OPM-NEXT: ret i32* [[CALL]] ; ; IS__TUNIT_NPM: Function Attrs: nounwind ; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() ; IS__TUNIT_NPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) -; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) [[ATTR4:#.*]] [ "preallocated"(token [[C]]) ] +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree noundef writeonly preallocated(i32) align 536870912 null) [[ATTR4:#.*]] [ "preallocated"(token [[C]]) ] ; IS__TUNIT_NPM-NEXT: ret i32* [[CALL]] ; ; IS__CGSCC_OPM: Function Attrs: nounwind ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() ; IS__CGSCC_OPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) -; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) [[ATTR6:#.*]] [ "preallocated"(token [[C]]) ] +; IS__CGSCC_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree noundef writeonly preallocated(i32) align 536870912 null) [[ATTR6:#.*]] [ "preallocated"(token [[C]]) ] ; IS__CGSCC_OPM-NEXT: ret i32* [[CALL]] ; ; IS__CGSCC_NPM: Function Attrs: nounwind ; IS__CGSCC_NPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() ; IS__CGSCC_NPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) -; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) [[ATTR5:#.*]] [ "preallocated"(token [[C]]) ] +; IS__CGSCC_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree noundef writeonly preallocated(i32) align 536870912 null) [[ATTR5:#.*]] [ "preallocated"(token [[C]]) ] ; IS__CGSCC_NPM-NEXT: ret i32* [[CALL]] ; %c = call token @llvm.call.preallocated.setup(i32 1) @@ -393,13 +393,13 @@ define internal void @test_sret(%struct.X* sret %a, %struct.X** %b) { ; ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind willreturn writeonly ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_sret -; IS__TUNIT____-SAME: (%struct.X* noalias nofree nonnull sret writeonly align 536870912 dereferenceable(8) [[A:%.*]], %struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B:%.*]]) +; IS__TUNIT____-SAME: (%struct.X* noalias nofree noundef nonnull sret writeonly align 536870912 dereferenceable(8) [[A:%.*]], %struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B:%.*]]) ; IS__TUNIT____-NEXT: store %struct.X* [[A]], %struct.X** [[B]], align 8 ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC____-LABEL: define {{[^@]+}}@test_sret -; IS__CGSCC____-SAME: (%struct.X* noalias nofree nonnull sret writeonly align 536870912 dereferenceable(8) [[A:%.*]], %struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B:%.*]]) +; IS__CGSCC____-SAME: (%struct.X* noalias nofree noundef nonnull sret writeonly align 536870912 dereferenceable(8) [[A:%.*]], %struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B:%.*]]) ; IS__CGSCC____-NEXT: store %struct.X* [[A]], %struct.X** [[B]], align 8 ; IS__CGSCC____-NEXT: ret void ; @@ -412,14 +412,13 @@ define void @complicated_args_sret(%struct.X** %b) { ; IS__TUNIT____: Function Attrs: argmemonly nofree nosync nounwind 
willreturn writeonly ; IS__TUNIT____-LABEL: define {{[^@]+}}@complicated_args_sret ; IS__TUNIT____-SAME: (%struct.X** nocapture nofree writeonly [[B:%.*]]) -; IS__TUNIT____-NEXT: call void @test_sret(%struct.X* noalias nocapture nofree writeonly align 536870912 null, %struct.X** nocapture nofree writeonly align 8 [[B]]) +; IS__TUNIT____-NEXT: call void @test_sret(%struct.X* noalias nocapture nofree noundef writeonly align 536870912 null, %struct.X** nocapture nofree writeonly align 8 [[B]]) ; IS__TUNIT____-NEXT: ret void ; ; IS__CGSCC____: Function Attrs: argmemonly nofree norecurse nosync nounwind willreturn writeonly ; IS__CGSCC____-LABEL: define {{[^@]+}}@complicated_args_sret ; IS__CGSCC____-SAME: (%struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B:%.*]]) -; IS__CGSCC____-NEXT: call void @test_sret(%struct.X* noalias nocapture nofree nonnull writeonly align 536870912 dereferenceable(8) null, %struct.X** nocapture nofree nonnull writeonly align 8 dereferenceable(8) [[B]]) -; IS__CGSCC____-NEXT: ret void +; IS__CGSCC____-NEXT: unreachable ; call void @test_sret(%struct.X* null, %struct.X** %b) ret void @@ -428,12 +427,12 @@ define void @complicated_args_sret(%struct.X** %b) { define internal %struct.X* @test_nest(%struct.X* nest %a) { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_nest -; IS__TUNIT____-SAME: (%struct.X* nest noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__TUNIT____-SAME: (%struct.X* nest noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__TUNIT____-NEXT: ret %struct.X* [[A]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@test_nest -; IS__CGSCC____-SAME: (%struct.X* nest noalias nofree readnone returned align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__CGSCC____-SAME: (%struct.X* nest noalias nofree noundef readnone returned align 536870912 "no-capture-maybe-returned" [[A:%.*]]) ; IS__CGSCC____-NEXT: ret %struct.X* [[A]] ; ret %struct.X* %a @@ -441,12 +440,12 @@ define internal %struct.X* @test_nest(%struct.X* nest %a) { define %struct.X* @complicated_args_nest() { ; IS__TUNIT____: Function Attrs: nofree nosync nounwind readnone willreturn ; IS__TUNIT____-LABEL: define {{[^@]+}}@complicated_args_nest() -; IS__TUNIT____-NEXT: [[CALL:%.*]] = call %struct.X* @test_nest(%struct.X* noalias nocapture nofree readnone align 536870912 null) +; IS__TUNIT____-NEXT: [[CALL:%.*]] = call %struct.X* @test_nest(%struct.X* noalias nocapture nofree noundef readnone align 536870912 null) ; IS__TUNIT____-NEXT: ret %struct.X* [[CALL]] ; ; IS__CGSCC____: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC____-LABEL: define {{[^@]+}}@complicated_args_nest() -; IS__CGSCC____-NEXT: [[CALL:%.*]] = call %struct.X* @test_nest(%struct.X* noalias nocapture nofree readnone align 536870912 null) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call %struct.X* @test_nest(%struct.X* noalias nocapture nofree noundef readnone align 536870912 null) ; IS__CGSCC____-NEXT: ret %struct.X* [[CALL]] ; %call = call %struct.X* @test_nest(%struct.X* null) @@ -457,7 +456,7 @@ define %struct.X* @complicated_args_nest() { define internal void @test_byval(%struct.X* byval %a) { ; IS__CGSCC_OPM: Function Attrs: nofree norecurse nosync nounwind readnone willreturn ; IS__CGSCC_OPM-LABEL: define {{[^@]+}}@test_byval -; 
IS__CGSCC_OPM-SAME: (%struct.X* noalias nocapture nofree nonnull writeonly byval align 8 dereferenceable(8) [[A:%.*]]) +; IS__CGSCC_OPM-SAME: (%struct.X* noalias nocapture nofree noundef nonnull writeonly byval align 8 dereferenceable(8) [[A:%.*]]) ; IS__CGSCC_OPM-NEXT: [[G0:%.*]] = getelementptr [[STRUCT_X:%.*]], %struct.X* [[A]], i32 0, i32 0 ; IS__CGSCC_OPM-NEXT: store i8* null, i8** [[G0]], align 8 ; IS__CGSCC_OPM-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 07976660546f8f..b9e739a62b5b97 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -27,7 +27,7 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 define void @delete_parallel_0() { ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_0() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined.willreturn to void (i32*, i32*, ...)*)) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*)* @.omp_outlined.willreturn to void (i32*, i32*, ...)*)) ; CHECK-NEXT: ret void ; entry: @@ -99,9 +99,9 @@ entry: define void @delete_parallel_1() { ; CHECK-LABEL: define {{[^@]+}}@delete_parallel_1() ; CHECK-NEXT: entry: -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*)) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*)* @.omp_outlined..0 to void (i32*, i32*, ...)*)) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 0, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*)) ; CHECK-NEXT: ret void ; entry: @@ -190,10 +190,10 @@ define void @delete_parallel_2() { ; CHECK-NEXT: [[TMP:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull align 4 dereferenceable(4) [[TMP]]) #0 ; CHECK-NEXT: store i32 0, i32* [[A]], align 4 -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) -; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* nonnull bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..3 to void (i32*, i32*, ...)*), i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..4 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..5 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A]]) +; CHECK-NEXT: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* noundef nonnull align 8 dereferenceable(24) @0, i32 1, void (i32*, i32*, ...)* noundef bitcast (void (i32*, i32*, i32*)* @.omp_outlined..6 to void (i32*, i32*, ...)*), i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A]]) ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[A]] to i8* ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull [[TMP1]]) ; CHECK-NEXT: ret void @@ -214,7 +214,7 @@ entry: define internal void @.omp_outlined..3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..3 -; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree nonnull align 4 dereferenceable(4) [[A:%.*]]) #6 +; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nofree noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) #6 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call i32 @omp_get_thread_num() #4 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], 0 @@ -244,7 +244,7 @@ if.end: ; preds = %if.then, %entry define internal void @.omp_outlined..4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 -; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(%struct.ident_t* nonnull @0, i32 [[TMP]]) @@ -286,7 +286,7 @@ declare void @__kmpc_end_master(%struct.ident_t*, i32) define internal void @.omp_outlined..5(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..5 -; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @0) ; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 @@ -324,7 +324,7 @@ omp_if.end: ; preds = %entry, %omp_if.then define internal void @.omp_outlined..6(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* dereferenceable(4) %a) { ; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..6 -; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture nonnull align 4 dereferenceable(4) [[A:%.*]]) +; CHECK-SAME: (i32* noalias nocapture nonnull readonly align 4 dereferenceable(4) 
[[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]], i32* nocapture noundef nonnull align 4 dereferenceable(4) [[A:%.*]]) ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A1:%.*]] = alloca i32, align 4 ; CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 From b246bea921ae09c6f6a1d8c4fee7229a24990027 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 18 Aug 2020 09:10:43 +0000 Subject: [PATCH 019/101] [gn build] Port 00d7b7d014f --- llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn index c807389846b78f..6f3b5d43e673d5 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn @@ -43,6 +43,7 @@ unittest("ToolingTests") { "RecursiveASTVisitorTests/CXXOperatorCallExprTraverser.cpp", "RecursiveASTVisitorTests/Callbacks.cpp", "RecursiveASTVisitorTests/Class.cpp", + "RecursiveASTVisitorTests/Concept.cpp", "RecursiveASTVisitorTests/ConstructExpr.cpp", "RecursiveASTVisitorTests/DeclRefExpr.cpp", "RecursiveASTVisitorTests/ImplicitCtor.cpp", From 13080ca1f0823b8df9651c1977040e5471c4a431 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Tue, 18 Aug 2020 11:32:51 +0200 Subject: [PATCH 020/101] [compiler-rt][test] XFAIL two tests on 32-bit sparc Two tests `FAIL` on 32-bit sparc: Profile-sparc :: Posix/instrprof-gcov-parallel.test UBSan-Standalone-sparc :: TestCases/Float/cast-overflow.cpp The failure mode is similar: Undefined first referenced symbol in file __atomic_store_4 /var/tmp/instrprof-gcov-parallel-6afe8d.o __atomic_load_4 /var/tmp/instrprof-gcov-parallel-6afe8d.o Undefined first referenced symbol in file __atomic_load_1 /var/tmp/cast-overflow-72a808.o This is a known bug: `clang` doesn't inline atomics on 32-bit sparc, unlike `gcc`. The patch therefore `XFAIL`s the tests. Tested on `sparcv9-sun-solaris2.11` and `amd64-pc-solaris2.11`. Differential Revision: https://reviews.llvm.org/D85346 --- compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test | 3 +++ compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test index 0c7198e3c4e9eb..52b51e6269f532 100644 --- a/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test +++ b/compiler-rt/test/profile/Posix/instrprof-gcov-parallel.test @@ -10,6 +10,9 @@ RUN: %run %t.driver %t.target RUN: llvm-cov gcov instrprof-gcov-parallel.target.gcda RUN: FileCheck --input-file instrprof-gcov-parallel.target.c.gcov %s +# Bug 42535 +# XFAIL: sparc-target-arch + # Test if the .gcda file is correctly created from one of child processes # and counters of all processes are recorded correctly. # 707 = CHILDREN * COUNT diff --git a/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp b/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp index 479c39f28428ad..1c680259a2471e 100644 --- a/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp +++ b/compiler-rt/test/ubsan/TestCases/Float/cast-overflow.cpp @@ -11,6 +11,9 @@ // FIXME: not %run %t 8 2>&1 | FileCheck %s --check-prefix=CHECK-8 // RUN: not %run %t 9 2>&1 | FileCheck %s --check-prefix=CHECK-9 +// Bug 42535 +// XFAIL: sparc-target-arch + // This test assumes float and double are IEEE-754 single- and double-precision. 
#if defined(__APPLE__) From 9b32ef9413be2f18ad98f24454854b438b5d9214 Mon Sep 17 00:00:00 2001 From: QingShan Zhang Date: Tue, 18 Aug 2020 09:40:37 +0000 Subject: [PATCH 021/101] [Test][NFC] Add a new test to verify if scheduler can cluster two ld/st even with different preds --- .../CodeGen/AArch64/aarch64-stp-cluster.ll | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll index e821e8504d962c..b0ed3d0490cc04 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -213,3 +213,28 @@ entry: store i32 %add, i32* %arrayidx1, align 4 ret void } + +; FIXME - The SU(4) and SU(7) can be clustered even with +; different preds +; CHECK: ********** MI Scheduling ********** +; CHECK-LABEL: cluster_with_different_preds:%bb.0 +; CHECK-NOT:Cluster ld/st SU(4) - SU(7) +; CHECK:SU(3): STRWui %2:gpr32, %0:gpr64common, 0 :: +; CHECK:SU(4): %3:gpr32 = LDRWui %1:gpr64common, 0 :: +; CHECK:Predecessors: +; CHECK: SU(3): Ord Latency=1 Memory +; CHECK:SU(6): STRBBui %4:gpr32, %1:gpr64common, 4 :: +; CHECK:SU(7): %5:gpr32 = LDRWui %1:gpr64common, 1 :: +; CHECK:Predecessors: +; CHECK:SU(6): Ord Latency=1 Memory +define i32 @cluster_with_different_preds(i32* %p, i32* %q) { +entry: + store i32 3, i32* %p, align 4 + %0 = load i32, i32* %q, align 4 + %add.ptr = getelementptr inbounds i32, i32* %q, i64 1 + %1 = bitcast i32* %add.ptr to i8* + store i8 5, i8* %1, align 1 + %2 = load i32, i32* %add.ptr, align 4 + %add = add nsw i32 %2, %0 + ret i32 %add +} From eaff200429a3dcf36eebfae39d2e859d6815285e Mon Sep 17 00:00:00 2001 From: sameeran joshi Date: Tue, 18 Aug 2020 15:05:51 +0530 Subject: [PATCH 022/101] [Flang] Move markdown files(.MD) from documentation/ to docs/ Summary: Other LLVM sub-projects use docs/ folder for documentation files. Follow LLVM project policy. Modify `documentation/` references in sources to `docs/`. This patch doesn't modify files to reStructuredText(.rst) file format. 
Reviewed By: DavidTruby, sscalpone Differential Revision: https://reviews.llvm.org/D85884 --- flang/README.md | 20 +++++++++---------- .../ArrayComposition.md | 2 +- .../BijectiveInternalNameUniquing.md | 0 flang/{documentation => docs}/C++17.md | 2 +- flang/{documentation => docs}/C++style.md | 2 +- flang/{documentation => docs}/Calls.md | 2 +- flang/{documentation => docs}/Character.md | 2 +- .../ControlFlowGraph.md | 2 +- flang/{documentation => docs}/Directives.md | 2 +- flang/{documentation => docs}/Extensions.md | 2 +- .../FortranForCProgrammers.md | 2 +- flang/{documentation => docs}/FortranIR.md | 2 +- .../IORuntimeInternals.md | 2 +- .../ImplementingASemanticCheck.md | 2 +- flang/{documentation => docs}/Intrinsics.md | 2 +- .../LabelResolution.md | 2 +- flang/{documentation => docs}/ModFiles.md | 2 +- .../OpenMP-4.5-grammar.txt | 2 +- .../OpenMP-semantics.md | 2 +- .../OptionComparison.md | 2 +- flang/{documentation => docs}/Overview.md | 2 +- .../ParserCombinators.md | 2 +- flang/{documentation => docs}/Parsing.md | 2 +- .../{documentation => docs}/Preprocessing.md | 2 +- .../PullRequestChecklist.md | 2 +- .../RuntimeDescriptor.md | 2 +- flang/{documentation => docs}/Semantics.md | 2 +- .../{documentation => docs}/f2018-grammar.txt | 2 +- .../{documentation => docs}/flang-c-style.el | 2 +- flang/lib/Evaluate/intrinsics.cpp | 2 +- 30 files changed, 38 insertions(+), 38 deletions(-) rename flang/{documentation => docs}/ArrayComposition.md (99%) rename flang/{documentation => docs}/BijectiveInternalNameUniquing.md (100%) rename flang/{documentation => docs}/C++17.md (99%) rename flang/{documentation => docs}/C++style.md (99%) rename flang/{documentation => docs}/Calls.md (99%) rename flang/{documentation => docs}/Character.md (99%) rename flang/{documentation => docs}/ControlFlowGraph.md (99%) rename flang/{documentation => docs}/Directives.md (92%) rename flang/{documentation => docs}/Extensions.md (99%) rename flang/{documentation => docs}/FortranForCProgrammers.md (99%) rename flang/{documentation => docs}/FortranIR.md (99%) rename flang/{documentation => docs}/IORuntimeInternals.md (99%) rename flang/{documentation => docs}/ImplementingASemanticCheck.md (99%) rename flang/{documentation => docs}/Intrinsics.md (99%) rename flang/{documentation => docs}/LabelResolution.md (99%) rename flang/{documentation => docs}/ModFiles.md (99%) rename flang/{documentation => docs}/OpenMP-4.5-grammar.txt (99%) rename flang/{documentation => docs}/OpenMP-semantics.md (99%) rename flang/{documentation => docs}/OptionComparison.md (99%) rename flang/{documentation => docs}/Overview.md (98%) rename flang/{documentation => docs}/ParserCombinators.md (99%) rename flang/{documentation => docs}/Parsing.md (99%) rename flang/{documentation => docs}/Preprocessing.md (99%) rename flang/{documentation => docs}/PullRequestChecklist.md (98%) rename flang/{documentation => docs}/RuntimeDescriptor.md (99%) rename flang/{documentation => docs}/Semantics.md (99%) rename flang/{documentation => docs}/f2018-grammar.txt (99%) rename flang/{documentation => docs}/flang-c-style.el (92%) diff --git a/flang/README.md b/flang/README.md index f7797ed55bd3ed..44573ae4b9b6b0 100644 --- a/flang/README.md +++ b/flang/README.md @@ -8,30 +8,30 @@ F18 was subsequently accepted into the LLVM project and rechristened as Flang. ## Getting Started -Read more about flang in the [documentation directory](documentation). -Start with the [compiler overview](documentation/Overview.md). 
+Read more about flang in the [docs directory](docs). +Start with the [compiler overview](docs/Overview.md). To better understand Fortran as a language and the specific grammar accepted by flang, -read [Fortran For C Programmers](documentation/FortranForCProgrammers.md) +read [Fortran For C Programmers](docs/FortranForCProgrammers.md) and -flang's specifications of the [Fortran grammar](documentation/f2018-grammar.txt) +flang's specifications of the [Fortran grammar](docs/f2018-grammar.txt) and -the [OpenMP grammar](documentation/OpenMP-4.5-grammar.txt). +the [OpenMP grammar](docs/OpenMP-4.5-grammar.txt). Treatment of language extensions is covered -in [this document](documentation/Extensions.md). +in [this document](docs/Extensions.md). To understand the compilers handling of intrinsics, -see the [discussion of intrinsics](documentation/Intrinsics.md). +see the [discussion of intrinsics](docs/Intrinsics.md). To understand how a flang program communicates with libraries at runtime, -see the discussion of [runtime descriptors](documentation/RuntimeDescriptor.md). +see the discussion of [runtime descriptors](docs/RuntimeDescriptor.md). If you're interested in contributing to the compiler, -read the [style guide](documentation/C++style.md) +read the [style guide](docs/C++style.md) and -also review [how flang uses modern C++ features](documentation/C++17.md). +also review [how flang uses modern C++ features](docs/C++17.md). ## Supported C++ compilers diff --git a/flang/documentation/ArrayComposition.md b/flang/docs/ArrayComposition.md similarity index 99% rename from flang/documentation/ArrayComposition.md rename to flang/docs/ArrayComposition.md index 099909c5ef0d04..0f30af39f9e4bb 100644 --- a/flang/documentation/ArrayComposition.md +++ b/flang/docs/ArrayComposition.md @@ -1,4 +1,4 @@ -* -| | | `-ParametersAndQualifiers -| | | |-( -| | | |-SimpleDeclaration -| | | | `-int -| | | `-) -| | `-; -| |-} -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-X - | | `-SimpleDeclarator - | | |-* - | | `-xp - | |-, - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-MemberPointer - | | | |-X - | | | |-:: - | | | `-* - | | `-pmi - | `-) - `-CompoundStatement - |-{ - |-ExpressionStatement - | |-BinaryOperatorExpression - | | |-IdExpression - | | | `-UnqualifiedId - | | | `-xp - | | |-->* - | | `-IdExpression - | | `-UnqualifiedId - | | `-pmi - | `-; - `-} -)txt")); + {R"txt( +BinaryOperatorExpression +|-IdExpression +| `-UnqualifiedId +| `-xp +|-->* +`-IdExpression + `-UnqualifiedId + `-pmi +)txt"})); } TEST_P(SyntaxTreeTest, OverloadedOperator_Negation) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { bool operator!(); }; void test(X x) { - !x; + [[!x]]; } )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-bool -| | |-SimpleDeclarator -| | | |-operator -| | | |-! -| | | `-ParametersAndQualifiers -| | | |-( -| | | `-) -| | `-; -| |-} -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-X - | | `-SimpleDeclarator - | | `-x - | `-) - `-CompoundStatement - |-{ - |-ExpressionStatement - | |-PrefixUnaryOperatorExpression - | | |-! - | | `-IdExpression - | | `-UnqualifiedId - | | `-x - | `-; - `-} -)txt")); + {R"txt( +PrefixUnaryOperatorExpression +|-! 
+`-IdExpression + `-UnqualifiedId + `-x +)txt"})); } TEST_P(SyntaxTreeTest, OverloadedOperator_AddressOf) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { X* operator&(); }; void test(X x) { - &x; + [[&x]]; } )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-X -| | |-SimpleDeclarator -| | | |-* -| | | |-operator -| | | |-& -| | | `-ParametersAndQualifiers -| | | |-( -| | | `-) -| | `-; -| |-} -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-X - | | `-SimpleDeclarator - | | `-x - | `-) - `-CompoundStatement - |-{ - |-ExpressionStatement - | |-PrefixUnaryOperatorExpression - | | |-& - | | `-IdExpression - | | `-UnqualifiedId - | | `-x - | `-; - `-} -)txt")); + {R"txt( +PrefixUnaryOperatorExpression +|-& +`-IdExpression + `-UnqualifiedId + `-x +)txt"})); } TEST_P(SyntaxTreeTest, OverloadedOperator_PrefixIncrement) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { X operator++(); }; void test(X x) { - ++x; + [[++x]]; } )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-X -| | |-SimpleDeclarator -| | | |-operator -| | | |-++ -| | | `-ParametersAndQualifiers -| | | |-( -| | | `-) -| | `-; -| |-} -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-X - | | `-SimpleDeclarator - | | `-x - | `-) - `-CompoundStatement - |-{ - |-ExpressionStatement - | |-PrefixUnaryOperatorExpression - | | |-++ - | | `-IdExpression - | | `-UnqualifiedId - | | `-x - | `-; - `-} -)txt")); + {R"txt( +PrefixUnaryOperatorExpression +|-++ +`-IdExpression + `-UnqualifiedId + `-x +)txt"})); } TEST_P(SyntaxTreeTest, OverloadedOperator_PostfixIncrement) { if (!GetParam().isCXX()) { return; } - EXPECT_TRUE(treeDumpEqual( + EXPECT_TRUE(treeDumpEqualOnAnnotations( R"cpp( struct X { X operator++(int); }; void test(X x) { - x++; + [[x++]]; } )cpp", - R"txt( -*: TranslationUnit -|-SimpleDeclaration -| |-struct -| |-X -| |-{ -| |-SimpleDeclaration -| | |-X -| | |-SimpleDeclarator -| | | |-operator -| | | |-++ -| | | `-ParametersAndQualifiers -| | | |-( -| | | |-SimpleDeclaration -| | | | `-int -| | | `-) -| | `-; -| |-} -| `-; -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | |-SimpleDeclaration - | | |-X - | | `-SimpleDeclarator - | | `-x - | `-) - `-CompoundStatement - |-{ - |-ExpressionStatement - | |-PostfixUnaryOperatorExpression - | | |-IdExpression - | | | `-UnqualifiedId - | | | `-x - | | `-++ - | `-; - `-} -)txt")); + {R"txt( +PostfixUnaryOperatorExpression +|-IdExpression +| `-UnqualifiedId +| `-x +`-++ +)txt"})); } TEST_P(SyntaxTreeTest, MultipleDeclaratorsGrouping) { @@ -3343,6 +2054,33 @@ void foo() { )txt")); } +TEST_P(SyntaxTreeTest, SizeTTypedef) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +typedef decltype(sizeof(void *)) size_t; + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-typedef + |-decltype + |-( + |-UnknownExpression + | |-sizeof + | |-( + | |-void + | |-* + | `-) + |-) + |-SimpleDeclarator + | `-size_t + `-; +)txt")); +} + TEST_P(SyntaxTreeTest, Namespaces) { if (!GetParam().isCXX()) { return; @@ -3496,68 +2234,318 @@ struct {} *a1; )txt")); } 
-TEST_P(SyntaxTreeTest, Templates) { +TEST_P(SyntaxTreeTest, StaticMemberFunction) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +struct S { + static void f(){} +}; +)cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-struct + |-S + |-{ + |-SimpleDeclaration + | |-static + | |-void + | |-SimpleDeclarator + | | |-f + | | `-ParametersAndQualifiers + | | |-( + | | `-) + | `-CompoundStatement + | |-{ + | `-} + |-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, ConversionMemberFunction) { if (!GetParam().isCXX()) { return; } - if (GetParam().hasDelayedTemplateParsing()) { - // FIXME: Make this test work on Windows by generating the expected syntax - // tree when `-fdelayed-template-parsing` is active. + EXPECT_TRUE(treeDumpEqual( + R"cpp( +struct X { + operator int(); +}; +)cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-struct + |-X + |-{ + |-SimpleDeclaration + | |-SimpleDeclarator + | | |-operator + | | |-int + | | `-ParametersAndQualifiers + | | |-( + | | `-) + | `-; + |-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, LiteralOperatorDeclaration) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +unsigned operator "" _c(char); + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-unsigned + |-SimpleDeclarator + | |-operator + | |-"" + | |-_c + | `-ParametersAndQualifiers + | |-( + | |-SimpleDeclaration + | | `-char + | `-) + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, NumericLiteralOperatorTemplateDeclaration) { + if (!GetParam().isCXX11OrLater()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +template +unsigned operator "" _t(); + )cpp", + R"txt( +*: TranslationUnit +`-TemplateDeclaration + |-template + |-< + |-SimpleDeclaration + | `-char + |-... 
+ |-> + `-SimpleDeclaration + |-unsigned + |-SimpleDeclarator + | |-operator + | |-"" + | |-_t + | `-ParametersAndQualifiers + | |-( + | `-) + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, OverloadedOperatorDeclaration) { + if (!GetParam().isCXX()) { return; } EXPECT_TRUE(treeDumpEqual( R"cpp( -template struct cls {}; -template int var = 10; -template int fun() {} +struct X { + X& operator=(const X&); +}; +)cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-struct + |-X + |-{ + |-SimpleDeclaration + | |-X + | |-SimpleDeclarator + | | |-& + | | |-operator + | | |-= + | | `-ParametersAndQualifiers + | | |-( + | | |-SimpleDeclaration + | | | |-const + | | | |-X + | | | `-SimpleDeclarator + | | | `-& + | | `-) + | `-; + |-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, OverloadedOperatorFriendDeclarataion) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +struct X { + friend X operator+(X, const X&); +}; +)cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-struct + |-X + |-{ + |-UnknownDeclaration + | `-SimpleDeclaration + | |-friend + | |-X + | |-SimpleDeclarator + | | |-operator + | | |-+ + | | `-ParametersAndQualifiers + | | |-( + | | |-SimpleDeclaration + | | | `-X + | | |-, + | | |-SimpleDeclaration + | | | |-const + | | | |-X + | | | `-SimpleDeclarator + | | | `-& + | | `-) + | `-; + |-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, ClassTemplateDeclaration) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +template +struct ST {}; )cpp", R"txt( *: TranslationUnit -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-struct -| |-cls -| |-{ -| |-} -| `-; -|-TemplateDeclaration -| |-template -| |-< -| |-UnknownDeclaration -| | |-class -| | `-T -| |-> -| `-SimpleDeclaration -| |-int -| |-SimpleDeclarator -| | |-var -| | |-= -| | `-IntegerLiteralExpression -| | `-10 -| `-; `-TemplateDeclaration |-template |-< |-UnknownDeclaration - | |-class + | |-typename + | `-T + |-> + `-SimpleDeclaration + |-struct + |-ST + |-{ + |-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, FunctionTemplateDeclaration) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +template +T f(); +)cpp", + R"txt( +*: TranslationUnit +`-TemplateDeclaration + |-template + |-< + |-UnknownDeclaration + | |-typename | `-T |-> `-SimpleDeclaration - |-int + |-T |-SimpleDeclarator - | |-fun + | |-f | `-ParametersAndQualifiers | |-( | `-) - `-CompoundStatement - |-{ - `-} + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, VariableTemplateDeclaration) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +template T var = 10; +)cpp", + R"txt( +*: TranslationUnit +`-TemplateDeclaration + |-template + |-< + |-UnknownDeclaration + | |-class + | `-T + |-> + `-SimpleDeclaration + |-T + |-SimpleDeclarator + | |-var + | |-= + | `-IntegerLiteralExpression + | `-10 + `-; +)txt")); +} + +TEST_P(SyntaxTreeTest, StaticMemberFunctionTemplate) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +struct S { + template + static U f(); +}; +)cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-struct + |-S + |-{ + |-TemplateDeclaration + | |-template + | |-< + | |-UnknownDeclaration + | | |-typename + | | `-U + | |-> + | `-SimpleDeclaration + | |-static + | |-U + | |-SimpleDeclarator + | | |-f + | | `-ParametersAndQualifiers + | | |-( + | | `-) + | `-; + |-} + `-; )txt")); } @@ -3606,6 +2594,59 @@ struct X { 
)txt")); } +TEST_P(SyntaxTreeTest, NestedTemplatesInNamespace) { + if (!GetParam().isCXX()) { + return; + } + EXPECT_TRUE(treeDumpEqual( + R"cpp( +namespace n { + template + struct ST { + template + static U f(); + }; +} +)cpp", + R"txt( +*: TranslationUnit +`-NamespaceDefinition + |-namespace + |-n + |-{ + |-TemplateDeclaration + | |-template + | |-< + | |-UnknownDeclaration + | | |-typename + | | `-T + | |-> + | `-SimpleDeclaration + | |-struct + | |-ST + | |-{ + | |-TemplateDeclaration + | | |-template + | | |-< + | | |-UnknownDeclaration + | | | |-typename + | | | `-U + | | |-> + | | `-SimpleDeclaration + | | |-static + | | |-U + | | |-SimpleDeclarator + | | | |-f + | | | `-ParametersAndQualifiers + | | | |-( + | | | `-) + | | `-; + | |-} + | `-; + `-} +)txt")); +} + TEST_P(SyntaxTreeTest, Templates2) { if (!GetParam().isCXX()) { return; diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp index 05fbac4f47e1c3..c5dbb770c53879 100644 --- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp @@ -171,7 +171,7 @@ ::testing::AssertionResult SyntaxTreeTest::treeDumpEqual(StringRef Code, << "Source file has syntax errors, they were printed to the test " "log"; } - std::string Actual = std::string(StringRef(Root->dump(*Arena)).trim()); + auto Actual = StringRef(Root->dump(*Arena)).trim().str(); // EXPECT_EQ shows the diff between the two strings if they are different. EXPECT_EQ(Tree.trim().str(), Actual); if (Actual != Tree.trim().str()) { @@ -194,21 +194,29 @@ SyntaxTreeTest::treeDumpEqualOnAnnotations(StringRef CodeWithAnnotations, "log"; } - bool failed = false; auto AnnotatedRanges = AnnotatedCode.ranges(); - assert(AnnotatedRanges.size() == TreeDumps.size()); - for (auto i = 0ul; i < AnnotatedRanges.size(); i++) { + if (AnnotatedRanges.size() != TreeDumps.size()) { + return ::testing::AssertionFailure() + << "The number of annotated ranges in the source code is different " + "to the number of their corresponding tree dumps."; + } + bool Failed = false; + for (unsigned i = 0; i < AnnotatedRanges.size(); i++) { auto *AnnotatedNode = nodeByRange(AnnotatedRanges[i], Root); assert(AnnotatedNode); auto AnnotatedNodeDump = - std::string(StringRef(AnnotatedNode->dump(*Arena)).trim()); + StringRef(AnnotatedNode->dump(*Arena)).trim().str(); // EXPECT_EQ shows the diff between the two strings if they are different. - EXPECT_EQ(TreeDumps[i].trim().str(), AnnotatedNodeDump); + EXPECT_EQ(TreeDumps[i].trim().str(), AnnotatedNodeDump) + << "Dumps diverged for the code:\n" + << AnnotatedCode.code().slice(AnnotatedRanges[i].Begin, + AnnotatedRanges[i].End); if (AnnotatedNodeDump != TreeDumps[i].trim().str()) - failed = true; + Failed = true; } - return failed ? ::testing::AssertionFailure() : ::testing::AssertionSuccess(); + return Failed ? ::testing::AssertionFailure() : ::testing::AssertionSuccess(); } + syntax::Node *SyntaxTreeTest::nodeByRange(llvm::Annotations::Range R, syntax::Node *Root) { ArrayRef Toks = tokens(Root); From 1b93ebccaa094c079db7ad729e2f7fea7bac2f34 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 5 Aug 2020 11:48:35 -0400 Subject: [PATCH 035/101] [OPENMP]Do not capture base pointer by reference if it is used as a base for array-like reduction. If the declaration is used in the reduction clause, it is captured by reference by default. 
But if the declaration is a pointer and it is a base for array-like reduction, this declaration can be captured by value, since the pointee is reduced but not the original declaration. Differential Revision: https://reviews.llvm.org/D85321 --- clang/lib/Sema/SemaOpenMP.cpp | 155 ++++++++++++------ ...te_parallel_for_reduction_task_codegen.cpp | 7 +- .../OpenMP/for_reduction_task_codegen.cpp | 3 +- .../parallel_for_reduction_task_codegen.cpp | 7 +- ...parallel_master_reduction_task_codegen.cpp | 7 +- .../parallel_reduction_task_codegen.cpp | 7 +- ...rallel_sections_reduction_task_codegen.cpp | 7 +- .../sections_reduction_task_codegen.cpp | 3 +- ...et_parallel_for_reduction_task_codegen.cpp | 7 +- ...target_parallel_reduction_task_codegen.cpp | 7 +- ...te_parallel_for_reduction_task_codegen.cpp | 7 +- ...te_parallel_for_reduction_task_codegen.cpp | 7 +- 12 files changed, 131 insertions(+), 93 deletions(-) diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index a493f3114dc299..53917ef98acdff 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -70,12 +70,15 @@ class DSAStackTy { const Expr *RefExpr = nullptr; DeclRefExpr *PrivateCopy = nullptr; SourceLocation ImplicitDSALoc; + bool AppliedToPointee = false; DSAVarData() = default; DSAVarData(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, const Expr *RefExpr, DeclRefExpr *PrivateCopy, - SourceLocation ImplicitDSALoc, unsigned Modifier) + SourceLocation ImplicitDSALoc, unsigned Modifier, + bool AppliedToPointee) : DKind(DKind), CKind(CKind), Modifier(Modifier), RefExpr(RefExpr), - PrivateCopy(PrivateCopy), ImplicitDSALoc(ImplicitDSALoc) {} + PrivateCopy(PrivateCopy), ImplicitDSALoc(ImplicitDSALoc), + AppliedToPointee(AppliedToPointee) {} }; using OperatorOffsetTy = llvm::SmallVector, 4>; @@ -99,6 +102,9 @@ class DSAStackTy { /// variable is marked as lastprivate(true) or not (false). llvm::PointerIntPair RefExpr; DeclRefExpr *PrivateCopy = nullptr; + /// true if the attribute is applied to the pointee, not the variable + /// itself. + bool AppliedToPointee = false; }; using DeclSAMapTy = llvm::SmallDenseMap; using UsedRefMapTy = llvm::SmallDenseMap; @@ -511,7 +517,8 @@ class DSAStackTy { /// Adds explicit data sharing attribute to the specified declaration. void addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A, - DeclRefExpr *PrivateCopy = nullptr, unsigned Modifier = 0); + DeclRefExpr *PrivateCopy = nullptr, unsigned Modifier = 0, + bool AppliedToPointee = false); /// Adds additional information for the reduction items with the reduction id /// represented as an operator. @@ -563,7 +570,8 @@ class DSAStackTy { /// match specified \a CPred predicate in any directive which matches \a DPred /// predicate. const DSAVarData - hasDSA(ValueDecl *D, const llvm::function_ref CPred, + hasDSA(ValueDecl *D, + const llvm::function_ref CPred, const llvm::function_ref DPred, bool FromParent) const; /// Checks if the specified variables has data-sharing attributes which @@ -571,15 +579,16 @@ class DSAStackTy { /// matches \a DPred predicate. const DSAVarData hasInnermostDSA(ValueDecl *D, - const llvm::function_ref CPred, + const llvm::function_ref CPred, const llvm::function_ref DPred, bool FromParent) const; /// Checks if the specified variables has explicit data-sharing /// attributes which match specified \a CPred predicate at the specified /// OpenMP region. 
- bool hasExplicitDSA(const ValueDecl *D, - const llvm::function_ref CPred, - unsigned Level, bool NotLastprivate = false) const; + bool + hasExplicitDSA(const ValueDecl *D, + const llvm::function_ref CPred, + unsigned Level, bool NotLastprivate = false) const; /// Returns true if the directive at level \Level matches in the /// specified \a DPred predicate. @@ -1185,6 +1194,7 @@ DSAStackTy::DSAVarData DSAStackTy::getDSA(const_iterator &Iter, DVar.CKind = Data.Attributes; DVar.ImplicitDSALoc = Iter->DefaultAttrLoc; DVar.Modifier = Data.Modifier; + DVar.AppliedToPointee = Data.AppliedToPointee; return DVar; } @@ -1341,7 +1351,8 @@ const ValueDecl *DSAStackTy::getParentLoopControlVariable(unsigned I) const { } void DSAStackTy::addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A, - DeclRefExpr *PrivateCopy, unsigned Modifier) { + DeclRefExpr *PrivateCopy, unsigned Modifier, + bool AppliedToPointee) { D = getCanonicalDecl(D); if (A == OMPC_threadprivate) { DSAInfo &Data = Threadprivates[D]; @@ -1365,12 +1376,14 @@ void DSAStackTy::addDSA(const ValueDecl *D, const Expr *E, OpenMPClauseKind A, Data.Attributes = A; Data.RefExpr.setPointerAndInt(E, IsLastprivate); Data.PrivateCopy = PrivateCopy; + Data.AppliedToPointee = AppliedToPointee; if (PrivateCopy) { DSAInfo &Data = getTopOfStack().SharingMap[PrivateCopy->getDecl()]; Data.Modifier = Modifier; Data.Attributes = A; Data.RefExpr.setPointerAndInt(PrivateCopy, IsLastprivate); Data.PrivateCopy = nullptr; + Data.AppliedToPointee = AppliedToPointee; } } } @@ -1480,7 +1493,8 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData( "set."); TaskgroupDescriptor = I->TaskgroupReductionRef; return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(), - Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task); + Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task, + /*AppliedToPointee=*/false); } return DSAVarData(); } @@ -1506,7 +1520,8 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopMostTaskgroupReductionData( "set."); TaskgroupDescriptor = I->TaskgroupReductionRef; return DSAVarData(I->Directive, OMPC_reduction, Data.RefExpr.getPointer(), - Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task); + Data.PrivateCopy, I->DefaultAttrLoc, OMPC_REDUCTION_task, + /*AppliedToPointee=*/false); } return DSAVarData(); } @@ -1675,6 +1690,7 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, DVar.ImplicitDSALoc = I->DefaultAttrLoc; DVar.DKind = I->Directive; DVar.Modifier = Data.Modifier; + DVar.AppliedToPointee = Data.AppliedToPointee; return DVar; } } @@ -1696,7 +1712,7 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, // listed in a firstprivate clause, even if they are static data members. 
DSAVarData DVarTemp = hasInnermostDSA( D, - [](OpenMPClauseKind C) { + [](OpenMPClauseKind C, bool) { return C == OMPC_firstprivate || C == OMPC_shared; }, MatchesAlways, FromParent); @@ -1725,6 +1741,7 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, DVar.ImplicitDSALoc = I->DefaultAttrLoc; DVar.DKind = I->Directive; DVar.Modifier = Data.Modifier; + DVar.AppliedToPointee = Data.AppliedToPointee; } return DVar; @@ -1755,7 +1772,7 @@ const DSAStackTy::DSAVarData DSAStackTy::getImplicitDSA(ValueDecl *D, const DSAStackTy::DSAVarData DSAStackTy::hasDSA(ValueDecl *D, - const llvm::function_ref CPred, + const llvm::function_ref CPred, const llvm::function_ref DPred, bool FromParent) const { if (isStackEmpty()) @@ -1771,14 +1788,14 @@ DSAStackTy::hasDSA(ValueDecl *D, continue; const_iterator NewI = I; DSAVarData DVar = getDSA(NewI, D); - if (I == NewI && CPred(DVar.CKind)) + if (I == NewI && CPred(DVar.CKind, DVar.AppliedToPointee)) return DVar; } return {}; } const DSAStackTy::DSAVarData DSAStackTy::hasInnermostDSA( - ValueDecl *D, const llvm::function_ref CPred, + ValueDecl *D, const llvm::function_ref CPred, const llvm::function_ref DPred, bool FromParent) const { if (isStackEmpty()) @@ -1792,26 +1809,28 @@ const DSAStackTy::DSAVarData DSAStackTy::hasInnermostDSA( return {}; const_iterator NewI = StartI; DSAVarData DVar = getDSA(NewI, D); - return (NewI == StartI && CPred(DVar.CKind)) ? DVar : DSAVarData(); + return (NewI == StartI && CPred(DVar.CKind, DVar.AppliedToPointee)) + ? DVar + : DSAVarData(); } bool DSAStackTy::hasExplicitDSA( - const ValueDecl *D, const llvm::function_ref CPred, + const ValueDecl *D, + const llvm::function_ref CPred, unsigned Level, bool NotLastprivate) const { if (getStackSize() <= Level) return false; D = getCanonicalDecl(D); const SharingMapTy &StackElem = getStackElemAtLevel(Level); auto I = StackElem.SharingMap.find(D); - if (I != StackElem.SharingMap.end() && - I->getSecond().RefExpr.getPointer() && - CPred(I->getSecond().Attributes) && + if (I != StackElem.SharingMap.end() && I->getSecond().RefExpr.getPointer() && + CPred(I->getSecond().Attributes, I->getSecond().AppliedToPointee) && (!NotLastprivate || !I->getSecond().RefExpr.getInt())) return true; // Check predetermined rules for the loop control variables. auto LI = StackElem.LCVMap.find(D); if (LI != StackElem.LCVMap.end()) - return CPred(OMPC_private); + return CPred(OMPC_private, /*AppliedToPointee=*/false); return false; } @@ -2057,14 +2076,17 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, // By default, all the data that has a scalar type is mapped by copy // (except for reduction variables). 
// Defaultmap scalar is mutual exclusive to defaultmap pointer - IsByRef = - (DSAStack->isForceCaptureByReferenceInTargetExecutable() && - !Ty->isAnyPointerType()) || - !Ty->isScalarType() || - DSAStack->isDefaultmapCapturedByRef( - Level, getVariableCategoryFromDecl(LangOpts, D)) || - DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K == OMPC_reduction; }, Level); + IsByRef = (DSAStack->isForceCaptureByReferenceInTargetExecutable() && + !Ty->isAnyPointerType()) || + !Ty->isScalarType() || + DSAStack->isDefaultmapCapturedByRef( + Level, getVariableCategoryFromDecl(LangOpts, D)) || + DSAStack->hasExplicitDSA( + D, + [](OpenMPClauseKind K, bool AppliedToPointee) { + return K == OMPC_reduction && !AppliedToPointee; + }, + Level); } } @@ -2075,8 +2097,9 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, OMPD_target) || !(DSAStack->hasExplicitDSA( D, - [](OpenMPClauseKind K) -> bool { - return K == OMPC_firstprivate; + [](OpenMPClauseKind K, bool AppliedToPointee) -> bool { + return K == OMPC_firstprivate || + (K == OMPC_reduction && AppliedToPointee); }, Level, /*NotLastprivate=*/true) || DSAStack->isUsesAllocatorsDecl(Level, D))) && @@ -2088,7 +2111,8 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, // copy !(DSAStack->getDefaultDSA() == DSA_firstprivate && !DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K != OMPC_unknown; }, Level) && + D, [](OpenMPClauseKind K, bool) { return K != OMPC_unknown; }, + Level) && !DSAStack->isLoopControlVariable(D, Level).first); } @@ -2151,7 +2175,8 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) checkDeclIsAllowedInOpenMPTarget(nullptr, VD); return nullptr; - } else if (isInOpenMPTargetExecutionDirective()) { + } + if (isInOpenMPTargetExecutionDirective()) { // If the declaration is enclosed in a 'declare target' directive, // then it should not be captured. // @@ -2204,7 +2229,8 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, return VD ? VD : Info.second; DSAStackTy::DSAVarData DVarTop = DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode()); - if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind)) + if (DVarTop.CKind != OMPC_unknown && isOpenMPPrivate(DVarTop.CKind) && + (!VD || VD->hasLocalStorage() || !DVarTop.AppliedToPointee)) return VD ? VD : cast(DVarTop.PrivateCopy->getDecl()); // Threadprivate variables must not be captured. if (isOpenMPThreadPrivate(DVarTop.CKind)) @@ -2212,7 +2238,11 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, // The variable is not private or it is the variable in the directive with // default(none) clause and not used in any clause. DSAStackTy::DSAVarData DVarPrivate = DSAStack->hasDSA( - D, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; }, + D, + [](OpenMPClauseKind C, bool AppliedToPointee) { + return isOpenMPPrivate(C) && !AppliedToPointee; + }, + [](OpenMPDirectiveKind) { return true; }, DSAStack->isClauseParsingMode()); // Global shared must not be captured. 
if (VD && !VD->hasLocalStorage() && DVarPrivate.CKind == OMPC_unknown && @@ -2266,7 +2296,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, (IsTriviallyCopyable || !isOpenMPTaskLoopDirective(CaptureRegions[CapLevel]))) { if (DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K == OMPC_firstprivate; }, + D, + [](OpenMPClauseKind K, bool) { return K == OMPC_firstprivate; }, Level, /*NotLastprivate=*/true)) return OMPC_firstprivate; DSAStackTy::DSAVarData DVar = DSAStack->getImplicitDSA(D, Level); @@ -2287,7 +2318,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, if ((DSAStack->getPossiblyLoopCunter() == D->getCanonicalDecl() || DSAStack->isLoopControlVariable(D).first) && !DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K != OMPC_private; }, Level) && + D, [](OpenMPClauseKind K, bool) { return K != OMPC_private; }, + Level) && !isOpenMPSimdDirective(DSAStack->getCurrentDirective())) return OMPC_private; } @@ -2295,7 +2327,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, if (DSAStack->isThreadPrivate(const_cast(VD)) && DSAStack->isForceVarCapturing() && !DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K == OMPC_copyin; }, Level)) + D, [](OpenMPClauseKind K, bool) { return K == OMPC_copyin; }, + Level)) return OMPC_private; } // User-defined allocators are private since they must be defined in the @@ -2306,7 +2339,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, DSAStackTy::UsesAllocatorsDeclKind::UserDefinedAllocator) return OMPC_private; return (DSAStack->hasExplicitDSA( - D, [](OpenMPClauseKind K) { return K == OMPC_private; }, Level) || + D, [](OpenMPClauseKind K, bool) { return K == OMPC_private; }, + Level) || (DSAStack->isClauseParsingMode() && DSAStack->getClauseParsingMode() == OMPC_private) || // Consider taskgroup reduction descriptor variable a private @@ -2331,15 +2365,16 @@ void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, OpenMPClauseKind OMPC = OMPC_unknown; for (unsigned I = DSAStack->getNestingLevel() + 1; I > Level; --I) { const unsigned NewLevel = I - 1; - if (DSAStack->hasExplicitDSA(D, - [&OMPC](const OpenMPClauseKind K) { - if (isOpenMPPrivate(K)) { - OMPC = K; - return true; - } - return false; - }, - NewLevel)) + if (DSAStack->hasExplicitDSA( + D, + [&OMPC](const OpenMPClauseKind K, bool AppliedToPointee) { + if (isOpenMPPrivate(K) && !AppliedToPointee) { + OMPC = K; + return true; + } + return false; + }, + NewLevel)) break; if (DSAStack->checkMappableExprComponentListsForDeclAtLevel( D, NewLevel, @@ -3474,7 +3509,10 @@ class DSAAttrChecker final : public StmtVisitor { // enclosing worksharing or parallel construct may not be accessed in an // explicit task. DVar = Stack->hasInnermostDSA( - VD, [](OpenMPClauseKind C) { return C == OMPC_reduction; }, + VD, + [](OpenMPClauseKind C, bool AppliedToPointee) { + return C == OMPC_reduction && !AppliedToPointee; + }, [](OpenMPDirectiveKind K) { return isOpenMPParallelDirective(K) || isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K); @@ -3559,7 +3597,10 @@ class DSAAttrChecker final : public StmtVisitor { // enclosing worksharing or parallel construct may not be accessed in // an explicit task. 
DVar = Stack->hasInnermostDSA( - FD, [](OpenMPClauseKind C) { return C == OMPC_reduction; }, + FD, + [](OpenMPClauseKind C, bool AppliedToPointee) { + return C == OMPC_reduction && !AppliedToPointee; + }, [](OpenMPDirectiveKind K) { return isOpenMPParallelDirective(K) || isOpenMPWorksharingDirective(K) || isOpenMPTeamsDirective(K); @@ -14044,7 +14085,10 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef VarList, // from the worksharing construct. if (isOpenMPTaskingDirective(CurrDir)) { DVar = DSAStack->hasInnermostDSA( - D, [](OpenMPClauseKind C) { return C == OMPC_reduction; }, + D, + [](OpenMPClauseKind C, bool AppliedToPointee) { + return C == OMPC_reduction && !AppliedToPointee; + }, [](OpenMPDirectiveKind K) { return isOpenMPParallelDirective(K) || isOpenMPWorksharingDirective(K) || @@ -14435,7 +14479,11 @@ class DSARefChecker : public StmtVisitor { if (DVar.CKind != OMPC_unknown) return true; DSAStackTy::DSAVarData DVarPrivate = Stack->hasDSA( - VD, isOpenMPPrivate, [](OpenMPDirectiveKind) { return true; }, + VD, + [](OpenMPClauseKind C, bool AppliedToPointee) { + return isOpenMPPrivate(C) && !AppliedToPointee; + }, + [](OpenMPDirectiveKind) { return true; }, /*FromParent=*/true); return DVarPrivate.CKind != OMPC_unknown; } @@ -15513,7 +15561,8 @@ static bool actOnOMPReductionKindClause( // correct analysis of in_reduction clauses. if (CurrDir == OMPD_taskgroup && ClauseKind == OMPC_task_reduction) Modifier = OMPC_REDUCTION_task; - Stack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref, Modifier); + Stack->addDSA(D, RefExpr->IgnoreParens(), OMPC_reduction, Ref, Modifier, + ASE || OASE); if (Modifier == OMPC_REDUCTION_task && (CurrDir == OMPD_taskgroup || ((isOpenMPParallelDirective(CurrDir) || diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp index 971e9be8534ba6..995ded43db3d84 100644 --- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp @@ -20,9 +20,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: alloca i32, // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], @@ -124,7 +124,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp index ea8fc55d9cb2f9..0018e109aaed9a 100644 --- a/clang/test/OpenMP/for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp @@ -124,7 +124,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp index b4f4f83ec95549..fcee3d645b4ae1 100644 --- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: alloca i32, // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], @@ -123,7 +123,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp index 0f8366fa95e338..ab76987a59c931 100644 --- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], // CHECK: [[TG:%.+]] = alloca i8*, @@ -122,7 +122,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp index 5e04aa8c1ec287..c64ffb50800648 100644 --- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], // CHECK: [[TG:%.+]] = alloca i8*, @@ -122,7 +122,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp index 867eb45a1332b6..5481f0b2daa4b3 100644 --- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: alloca i32, // CHECK: alloca i32, // CHECK: alloca i32, @@ -127,7 +127,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp index be67a2a174004f..1c0be118a03ca4 100644 --- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp @@ -128,7 +128,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp index 5c5ea6b90d5297..66a20141df0394 100644 --- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: alloca i32, // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], @@ -123,7 +123,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp index 2fc49d44c1e904..e42e372ea67a72 100644 --- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x %struct.kmp_taskred_input_t], // CHECK: [[TG:%.+]] = alloca i8*, @@ -122,7 +122,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp index 06c0f8744e8cca..fbd990699d8327 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -19,9 +19,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8** %{{.+}}) -// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8*** {{.+}}) +// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8** {{.+}}) // CHECK: alloca i32, // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32, // CHECK: [[TR:%.+]] = alloca [2 x [[TASKRED_TY:%struct.kmp_taskred_input_t.*]]], @@ -123,7 +123,6 @@ int main(int argc, char **argv) { // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]], // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]], // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]], -// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]], -// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 +// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2 #endif diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp index 194999f8cbb05c..26ca2352cc9ac5 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp @@ -20,9 +20,9 @@ int main(int argc, char **argv) { } } -// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8***)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8*** %{{.+}}) +// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*, i8**)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), i64 %{{.+}}, i64 %{{.+}}, i32* %{{.+}}, i8** %{{.+}})
 
-// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8*** {{.+}})
+// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* {{.+}}, i8** {{.+}})
 // CHECK: alloca i32,
 // CHECK: [[ARGC_FP_ADDR:%.+]] = alloca i32,
 // CHECK: [[TR:%.+]] = alloca [2 x [[TASKRED_TY:%struct.kmp_taskred_input_t.*]]],
@@ -124,7 +124,6 @@ int main(int argc, char **argv) {
 // CHECK_DAG: [[TG]] = load i8*, i8** [[TG_ADDR]],
 // CHECK-DAG: [[ARGV_REF]] = load i8*, i8** [[ARGV_ADDR:%.+]],
 // CHECK-DAG: [[ARGV_ADDR]] = load i8**, i8*** [[ARGV_ADDR_REF:%.+]],
-// CHECK-DAG: [[ARGV_ADDR_REF:%.+]] = load i8***, i8**** [[ARGV:%.+]],
-// CHECK-DAG: [[ARGV]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-DAG: [[ARGV_ADDR_REF]] = getelementptr inbounds [[CAPS_TY]], [[CAPS_TY]]* [[CAP]], i32 0, i32 2
 
 #endif

From bd7daf5ceb92db00d3fc5d1ce8d4f74dcd03ebb9 Mon Sep 17 00:00:00 2001
From: Georgii Rymar
Date: Tue, 18 Aug 2020 15:52:09 +0300
Subject: [PATCH 036/101] [yaml2obj] - Don't crash when `FileHeader` declares
 an empty `Flags` key in specific situations.

We currently call `llvm_unreachable` for the following YAML:

```
--- !ELF
FileHeader:
  Class: ELFCLASS32
  Data: ELFDATA2LSB
  Type: ET_REL
  Machine: EM_NONE
  Flags: [ ]
```

It happens because the `Flags` key is present even though `EM_NONE` is a
machine type that has no known `EF_*` values, so we call `llvm_unreachable`
by mistake.

Differential Revision: https://reviews.llvm.org/D86138
---
 llvm/lib/ObjectYAML/ELFYAML.cpp          |  4 +---
 llvm/test/tools/yaml2obj/ELF/eflags.yaml | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/tools/yaml2obj/ELF/eflags.yaml

diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 319e37022c85a9..e5d5e6a01bc6bf 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -434,10 +434,8 @@ void ScalarBitSetTraits::bitset(IO &IO,
     BCase(EF_AMDGPU_XNACK);
     BCase(EF_AMDGPU_SRAM_ECC);
     break;
-  case ELF::EM_X86_64:
-    break;
   default:
-    llvm_unreachable("Unsupported architecture");
+    break;
   }
 #undef BCase
 #undef BCaseMask
diff --git a/llvm/test/tools/yaml2obj/ELF/eflags.yaml b/llvm/test/tools/yaml2obj/ELF/eflags.yaml
new file mode 100644
index 00000000000000..8b90a2b2c94451
--- /dev/null
+++ b/llvm/test/tools/yaml2obj/ELF/eflags.yaml
@@ -0,0 +1,16 @@
+## Check how the 'Flags' key can be used to encode e_flags field values.
+
+## Check we are able to produce no flags for EM_NONE. EM_NONE is an arbitrary
+## e_machine type that has no EF_* values defined for it.
+# RUN: yaml2obj %s -o %t-no-flags +# RUN: llvm-readelf --file-headers %t-no-flags | FileCheck %s --check-prefix=NOFLAGS + +# NOFLAGS: Flags: 0x0{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_NONE + Flags: [ ] From e0aa335334813b15d2106ccdcf4930d72aa33772 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Tue, 18 Aug 2020 08:24:37 -0400 Subject: [PATCH 037/101] [InstCombine] add tests for fneg+fabs; NFC --- llvm/test/Transforms/InstCombine/fabs.ll | 63 +++++++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll index 0b474045d59675..c0d2c3b701f033 100644 --- a/llvm/test/Transforms/InstCombine/fabs.ll +++ b/llvm/test/Transforms/InstCombine/fabs.ll @@ -4,6 +4,7 @@ ; Make sure libcalls are replaced with intrinsic calls. declare float @llvm.fabs.f32(float) +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) declare double @llvm.fabs.f64(double) declare fp128 @llvm.fabs.f128(fp128) @@ -13,6 +14,8 @@ declare fp128 @fabsl(fp128) declare float @llvm.fma.f32(float, float, float) declare float @llvm.fmuladd.f32(float, float, float) +declare void @use(float) + define float @replace_fabs_call_f32(float %x) { ; CHECK-LABEL: @replace_fabs_call_f32( ; CHECK-NEXT: [[FABSF:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]]) @@ -116,8 +119,8 @@ define float @square_fabs_shrink_call1(float %x) { define float @square_fabs_shrink_call2(float %x) { ; CHECK-LABEL: @square_fabs_shrink_call2( ; CHECK-NEXT: [[SQ:%.*]] = fmul float [[X:%.*]], [[X]] -; CHECK-NEXT: [[TRUNC:%.*]] = call float @llvm.fabs.f32(float [[SQ]]) -; CHECK-NEXT: ret float [[TRUNC]] +; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[SQ]]) +; CHECK-NEXT: ret float [[TMP1]] ; %sq = fmul float %x, %x %ext = fpext float %sq to double @@ -745,3 +748,59 @@ define half @select_fcmp_nnan_nsz_uge_negzero_unary_fneg(half %x) { %fabs = select i1 %gezero, half %x, half %negx ret half %fabs } + +define float @select_fneg(i1 %c, float %x) { +; CHECK-LABEL: @select_fneg( +; CHECK-NEXT: [[N:%.*]] = fneg float [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], float [[N]], float [[X]] +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[S]]) +; CHECK-NEXT: ret float [[FABS]] +; + %n = fneg float %x + %s = select i1 %c, float %n, float %x + %fabs = call float @llvm.fabs.f32(float %s) + ret float %fabs +} + +define float @select_fneg_use1(i1 %c, float %x) { +; CHECK-LABEL: @select_fneg_use1( +; CHECK-NEXT: [[N:%.*]] = fneg float [[X:%.*]] +; CHECK-NEXT: call void @use(float [[N]]) +; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], float [[X]], float [[N]] +; CHECK-NEXT: [[FABS:%.*]] = call fast float @llvm.fabs.f32(float [[S]]) +; CHECK-NEXT: ret float [[FABS]] +; + %n = fneg float %x + call void @use(float %n) + %s = select i1 %c, float %x, float %n + %fabs = call fast float @llvm.fabs.f32(float %s) + ret float %fabs +} + +define float @select_fneg_use2(i1 %c, float %x) { +; CHECK-LABEL: @select_fneg_use2( +; CHECK-NEXT: [[N:%.*]] = fneg arcp float [[X:%.*]] +; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], float [[N]], float [[X]] +; CHECK-NEXT: call void @use(float [[S]]) +; CHECK-NEXT: [[FABS:%.*]] = call nnan nsz float @llvm.fabs.f32(float [[S]]) +; CHECK-NEXT: ret float [[FABS]] +; + %n = fneg arcp float %x + %s = select i1 %c, float %n, float %x + call void @use(float %s) + %fabs = call nnan nsz float @llvm.fabs.f32(float %s) + ret float %fabs +} + +define <2 x float> 
@select_fneg_vec(<2 x i1> %c, <2 x float> %x) {
+; CHECK-LABEL: @select_fneg_vec(
+; CHECK-NEXT:    [[N:%.*]] = fneg <2 x float> [[X:%.*]]
+; CHECK-NEXT:    [[S:%.*]] = select fast <2 x i1> [[C:%.*]], <2 x float> [[X]], <2 x float> [[N]]
+; CHECK-NEXT:    [[FABS:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[S]])
+; CHECK-NEXT:    ret <2 x float> [[FABS]]
+;
+  %n = fneg <2 x float> %x
+  %s = select fast <2 x i1> %c, <2 x float> %x, <2 x float> %n
+  %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %s)
+  ret <2 x float> %fabs
+}

From 139da9c4d74391cd9d12600650ef95d5d68d8b59 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 18 Aug 2020 09:19:03 -0400
Subject: [PATCH 038/101] [InstCombine] fold fabs of select with negated operand

This is the FP example shown in:
https://bugs.llvm.org/PR39474
---
 .../InstCombine/InstCombineCalls.cpp     | 20 +++++++++++++------
 llvm/test/Transforms/InstCombine/fabs.ll | 13 ++++--------
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 6a188f6a4da416..fa9c6e184e3858 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1230,13 +1230,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     break;
   }
   case Intrinsic::fabs: {
-    Value *Cond;
-    Constant *LHS, *RHS;
+    Value *Cond, *TVal, *FVal;
     if (match(II->getArgOperand(0),
-              m_Select(m_Value(Cond), m_Constant(LHS), m_Constant(RHS)))) {
-      CallInst *Call0 = Builder.CreateCall(II->getCalledFunction(), {LHS});
-      CallInst *Call1 = Builder.CreateCall(II->getCalledFunction(), {RHS});
-      return SelectInst::Create(Cond, Call0, Call1);
+              m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
+      // fabs (select Cond, TrueC, FalseC) --> select Cond, AbsT, AbsF
+      if (isa<Constant>(TVal) && isa<Constant>(FVal)) {
+        CallInst *AbsT = Builder.CreateCall(II->getCalledFunction(), {TVal});
+        CallInst *AbsF = Builder.CreateCall(II->getCalledFunction(), {FVal});
+        return SelectInst::Create(Cond, AbsT, AbsF);
+      }
+      // fabs (select Cond, -FVal, FVal) --> fabs FVal
+      if (match(TVal, m_FNeg(m_Specific(FVal))))
+        return replaceOperand(*II, 0, FVal);
+      // fabs (select Cond, TVal, -TVal) --> fabs TVal
+      if (match(FVal, m_FNeg(m_Specific(TVal))))
+        return replaceOperand(*II, 0, TVal);
     }
     LLVM_FALLTHROUGH;
diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll
index c0d2c3b701f033..f8b70afea3803c 100644
--- a/llvm/test/Transforms/InstCombine/fabs.ll
+++ b/llvm/test/Transforms/InstCombine/fabs.ll
@@ -751,9 +751,7 @@ define half @select_fcmp_nnan_nsz_uge_negzero_unary_fneg(half %x) {

 define float @select_fneg(i1 %c, float %x) {
 ; CHECK-LABEL: @select_fneg(
-; CHECK-NEXT:    [[N:%.*]] = fneg float [[X:%.*]]
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[C:%.*]], float [[N]], float [[X]]
-; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[S]])
+; CHECK-NEXT:    [[FABS:%.*]] = call float @llvm.fabs.f32(float [[X:%.*]])
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %n = fneg float %x
@@ -766,8 +764,7 @@ define float @select_fneg_use1(i1 %c, float %x) {
 ; CHECK-LABEL: @select_fneg_use1(
 ; CHECK-NEXT:    [[N:%.*]] = fneg float [[X:%.*]]
 ; CHECK-NEXT:    call void @use(float [[N]])
-; CHECK-NEXT:    [[S:%.*]] = select i1 [[C:%.*]], float [[X]], float [[N]]
-; CHECK-NEXT:    [[FABS:%.*]] = call fast float @llvm.fabs.f32(float [[S]])
+; CHECK-NEXT:    [[FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %n = 
fneg float %x
@@ -782,7 +779,7 @@ define float @select_fneg_use2(i1 %c, float %x) {
 ; CHECK-NEXT:    [[N:%.*]] = fneg arcp float [[X:%.*]]
 ; CHECK-NEXT:    [[S:%.*]] = select i1 [[C:%.*]], float [[N]], float [[X]]
 ; CHECK-NEXT:    call void @use(float [[S]])
-; CHECK-NEXT:    [[FABS:%.*]] = call nnan nsz float @llvm.fabs.f32(float [[S]])
+; CHECK-NEXT:    [[FABS:%.*]] = call nnan nsz float @llvm.fabs.f32(float [[X]])
 ; CHECK-NEXT:    ret float [[FABS]]
 ;
   %n = fneg arcp float %x
@@ -794,9 +791,7 @@ define float @select_fneg_use2(i1 %c, float %x) {

 define <2 x float> @select_fneg_vec(<2 x i1> %c, <2 x float> %x) {
 ; CHECK-LABEL: @select_fneg_vec(
-; CHECK-NEXT:    [[N:%.*]] = fneg <2 x float> [[X:%.*]]
-; CHECK-NEXT:    [[S:%.*]] = select fast <2 x i1> [[C:%.*]], <2 x float> [[X]], <2 x float> [[N]]
-; CHECK-NEXT:    [[FABS:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[S]])
+; CHECK-NEXT:    [[FABS:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[X:%.*]])
 ; CHECK-NEXT:    ret <2 x float> [[FABS]]
 ;
   %n = fneg <2 x float> %x

From 740332b6cce3e59dca4e50d3e2fd0d008f5e9529 Mon Sep 17 00:00:00 2001
From: Georgii Rymar
Date: Wed, 12 Aug 2020 16:54:49 +0300
Subject: [PATCH 039/101] [llvm-readobj/elf] - Refine testing of broken Android's packed relocation sections.

This uses the modern `split-file` tool to merge the 5 `packed-relocs-error*.s`
tests into a new `packed-relocs-errors.s` and adds testing for the GNU output
style.

Differential revision: https://reviews.llvm.org/D85835
---
 .../llvm-readobj/ELF/packed-relocs-error1.s   |  8 ---
 .../llvm-readobj/ELF/packed-relocs-error2.s   |  8 ---
 .../llvm-readobj/ELF/packed-relocs-error3.s   | 10 ---
 .../llvm-readobj/ELF/packed-relocs-error4.s   | 14 ----
 .../llvm-readobj/ELF/packed-relocs-error5.s   | 14 ----
 .../llvm-readobj/ELF/packed-relocs-errors.s   | 66 +++++++++++++++++++
 6 files changed, 66 insertions(+), 54 deletions(-)
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-error1.s
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-error2.s
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-error3.s
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-error4.s
 delete mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-error5.s
 create mode 100644 llvm/test/tools/llvm-readobj/ELF/packed-relocs-errors.s

diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error1.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error1.s
deleted file mode 100644
index 07fbd78b09ece2..00000000000000
--- a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error1.s
+++ /dev/null
@@ -1,8 +0,0 @@
-// REQUIRES: x86-registered-target
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj --relocations %t 2>&1 | FileCheck %s -DFILE=%t
-
-// CHECK: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: invalid packed relocation header
-
-.section .rela.dyn, "a", @0x60000001
-.ascii "APS9"
diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error2.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error2.s
deleted file mode 100644
index ea14995e0ded11..00000000000000
--- a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error2.s
+++ /dev/null
@@ -1,8 +0,0 @@
-// REQUIRES: x86-registered-target
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t
-// RUN: llvm-readobj --relocations %t 2>&1 | FileCheck %s -DFILE=%t
-
-// CHECK: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: malformed sleb128, extends past end
-
-.section 
.rela.dyn, "a", @0x60000001 -.ascii "APS2" diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error3.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error3.s deleted file mode 100644 index 766c551295ae62..00000000000000 --- a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error3.s +++ /dev/null @@ -1,10 +0,0 @@ -// REQUIRES: x86-registered-target -// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t -// RUN: llvm-readobj --relocations %t 2>&1 | FileCheck %s -DFILE=%t - -// CHECK: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: malformed sleb128, extends past end - -.section .rela.dyn, "a", @0x60000001 -.ascii "APS2" -.sleb128 4 // Number of relocations -.sleb128 0 // Initial offset diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error4.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error4.s deleted file mode 100644 index 191e0b7885c46e..00000000000000 --- a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error4.s +++ /dev/null @@ -1,14 +0,0 @@ -// REQUIRES: x86-registered-target -// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t -// RUN: llvm-readobj --relocations %t 2>&1 | FileCheck %s -DFILE=%t - -// CHECK: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: malformed sleb128, extends past end - -.section .rela.dyn, "a", @0x60000001 -.ascii "APS2" -.sleb128 4 // Number of relocations -.sleb128 0 // Initial offset - -.sleb128 2 // Number of relocations in group -.sleb128 2 // RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG -.sleb128 8 // offset delta diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error5.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error5.s deleted file mode 100644 index 8a6d6560f52056..00000000000000 --- a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-error5.s +++ /dev/null @@ -1,14 +0,0 @@ -// REQUIRES: x86-registered-target -// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t -// RUN: llvm-readobj --relocations %t 2>&1 | FileCheck %s -DFILE=%t - -// CHECK: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: relocation group unexpectedly large - -.section .rela.dyn, "a", @0x60000001 -.ascii "APS2" -.sleb128 4 // Number of relocations -.sleb128 0 // Initial offset - -.sleb128 5 // Number of relocations in group -.sleb128 2 // RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG -.sleb128 8 // offset delta diff --git a/llvm/test/tools/llvm-readobj/ELF/packed-relocs-errors.s b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-errors.s new file mode 100644 index 00000000000000..4f2e65ed220f7a --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/packed-relocs-errors.s @@ -0,0 +1,66 @@ +# REQUIRES: x86-registered-target + +## Test that we report meaningful warnings when dumping +## broken Android's packed relocation sections. 
+ +# RUN: split-file %s %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/asm1.s -o %t1.o +# RUN: llvm-readobj --relocations %t1.o 2>&1 | FileCheck %s -DFILE=%t1.o --check-prefix=ERR-HEADER +# RUN: llvm-readelf --relocations %t1.o 2>&1 | FileCheck %s -DFILE=%t1.o --check-prefix=ERR-HEADER + +#--- asm1.s +.section .rela.dyn, "a", @0x60000001 +.ascii "APS9" + +# ERR-HEADER: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: invalid packed relocation header + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/asm2.s -o %t2.o +# RUN: llvm-readobj --relocations %t2.o 2>&1 | FileCheck %s -DFILE=%t2.o --check-prefix=ERR-PAST-END +# RUN: llvm-readelf --relocations %t2.o 2>&1 | FileCheck %s -DFILE=%t2.o --check-prefix=ERR-PAST-END + +#--- asm2.s +.section .rela.dyn, "a", @0x60000001 +.ascii "APS2" + +# ERR-PAST-END: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: malformed sleb128, extends past end + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/asm3.s -o %t3.o +# RUN: llvm-readobj --relocations %t3.o 2>&1 | FileCheck %s -DFILE=%t3.o --check-prefix=ERR-PAST-END +# RUN: llvm-readelf --relocations %t3.o 2>&1 | FileCheck %s -DFILE=%t3.o --check-prefix=ERR-PAST-END + +#--- asm3.s +.section .rela.dyn, "a", @0x60000001 +.ascii "APS2" +.sleb128 4 ## Number of relocations +.sleb128 0 ## Initial offset + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/asm4.s -o %t4.o +# RUN: llvm-readobj --relocations %t4.o 2>&1 | FileCheck %s -DFILE=%t4.o --check-prefix=ERR-PAST-END +# RUN: llvm-readelf --relocations %t4.o 2>&1 | FileCheck %s -DFILE=%t4.o --check-prefix=ERR-PAST-END + +#--- asm4.s +.section .rela.dyn, "a", @0x60000001 +.ascii "APS2" +.sleb128 4 ## Number of relocations +.sleb128 0 ## Initial offset + +.sleb128 2 ## Number of relocations in group +.sleb128 2 ## RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG +.sleb128 8 ## offset delta + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t/asm5.s -o %t5.o +# RUN: llvm-readobj --relocations %t5.o 2>&1 | FileCheck %s -DFILE=%t5.o --check-prefix=ERR-LARGE +# RUN: llvm-readelf --relocations %t5.o 2>&1 | FileCheck %s -DFILE=%t5.o --check-prefix=ERR-LARGE + +# ERR-LARGE: warning: '[[FILE]]': unable to read relocations from SHT_ANDROID_REL section with index 3: relocation group unexpectedly large + +#--- asm5.s +.section .rela.dyn, "a", @0x60000001 +.ascii "APS2" +.sleb128 4 ## Number of relocations +.sleb128 0 ## Initial offset + +.sleb128 5 ## Number of relocations in group +.sleb128 2 ## RELOCATION_GROUPED_BY_OFFSET_DELTA_FLAG +.sleb128 8 ## offset delta From 3ba7777b94d887af594ba8d6c1378166bd361a20 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 25 Jul 2020 13:21:31 -0400 Subject: [PATCH 040/101] AMDGPU/GlobalISel: Fix selection of s1/s16 G_[F]CONSTANT The code to determine the value size was overcomplicated and only correct in the case where the result register already had a register class assigned. We can always take the size directly from the register's type. 
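
To illustrate the point, here is a minimal sketch of the new size query
(illustration only, not the verbatim change; the real code is in the diff
below):

```
// Sketch: the LLT of the destination register always knows the size,
// even for an unconstrained s1 or s16 virtual register.
Register DstReg = I.getOperand(0).getReg();
unsigned Size = MRI->getType(DstReg).getSizeInBits();

// The old path went through the register class instead, which is only
// valid once a class has actually been assigned:
//   const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
//   unsigned Size = TRI.getRegSizeInBits(*RC);
```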
--- .../AMDGPU/AMDGPUInstructionSelector.cpp | 33 +- .../GlobalISel/divergent-control-flow.ll | 26 +- .../AMDGPU/GlobalISel/inst-select-and.mir | 34 ++- .../GlobalISel/inst-select-constant.mir | 282 +++++++++++++----- .../GlobalISel/inst-select-fconstant.mir | 33 +- .../AMDGPU/GlobalISel/inst-select-or.mir | 34 ++- .../AMDGPU/GlobalISel/inst-select-xor.mir | 34 ++- 7 files changed, 318 insertions(+), 158 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 7e842835a5b44d..f2ecc50d472e54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2043,6 +2043,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); MachineOperand &ImmOp = I.getOperand(1); + Register DstReg = I.getOperand(0).getReg(); + unsigned Size = MRI->getType(DstReg).getSizeInBits(); // The AMDGPU backend only supports Imm operands and not CImm or FPImm. if (ImmOp.isFPImm()) { @@ -2050,26 +2052,27 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const { ImmOp.ChangeToImmediate(Imm.getZExtValue()); } else if (ImmOp.isCImm()) { ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue()); + } else { + llvm_unreachable("Not supported by g_constants"); } - Register DstReg = I.getOperand(0).getReg(); - unsigned Size; - bool IsSgpr; - const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg()); - if (RB) { - IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID; - Size = MRI->getType(DstReg).getSizeInBits(); + const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI); + const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID; + + unsigned Opcode; + if (DstRB->getID() == AMDGPU::VCCRegBankID) { + Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; } else { - const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg); - IsSgpr = TRI.isSGPRClass(RC); - Size = TRI.getRegSizeInBits(*RC); - } + Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - if (Size != 32 && Size != 64) - return false; + // We should never produce s1 values on banks other than VCC. If the user of + // this already constrained the register, we may incorrectly think it's VCC + // if it wasn't originally. + if (Size == 1) + return false; + } - unsigned Opcode = IsSgpr ? 
AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; - if (Size == 32) { + if (Size != 64) { I.setDesc(TII.get(Opcode)); I.addImplicitDefUseOperands(*MF); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 1f9c3bc60876e2..7564251c755d93 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -135,24 +135,24 @@ define void @constrained_if_register_class() { ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_cmp_lg_u32 s4, 0 -; CHECK-NEXT: s_cselect_b32 s5, 1, 0 -; CHECK-NEXT: s_xor_b32 s5, s5, -1 -; CHECK-NEXT: s_and_b32 s5, s5, 1 -; CHECK-NEXT: s_mov_b32 s4, -1 -; CHECK-NEXT: s_cmp_lg_u32 s5, 0 +; CHECK-NEXT: s_cselect_b32 s4, 1, 0 +; CHECK-NEXT: s_xor_b32 s4, s4, -1 +; CHECK-NEXT: s_and_b32 s4, s4, 1 +; CHECK-NEXT: s_cmp_lg_u32 s4, 0 ; CHECK-NEXT: s_cbranch_scc0 BB4_6 ; CHECK-NEXT: ; %bb.1: ; %bb2 -; CHECK-NEXT: s_getpc_b64 s[6:7] -; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4 -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s7 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: flat_load_dword v0, v[0:1] -; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1 +; CHECK-NEXT: s_mov_b32 s4, -1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0 ; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir index 81437acbbbc53b..7907608432ff19 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-and.mir @@ -420,34 +420,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: 
copy_select_constrain_vcc_result_reg_wave32 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 %0:vgpr(s1) = G_TRUNC %1(s32) - %2:sgpr(s1) = G_CONSTANT i1 true + %sgpr0:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s1) = G_TRUNC %sgpr0 %6:sgpr(s32) = G_CONSTANT i32 0 %7:sgpr(p1) = G_IMPLICIT_DEF %9:vcc(s1) = COPY %0(s1) @@ -466,34 +467,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_AND_B32_1]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 %0:vgpr(s1) = G_TRUNC %1(s32) - %2:sgpr(s1) = G_CONSTANT i1 true + %sgpr0:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s1) = G_TRUNC %sgpr0 %6:sgpr(s32) = G_CONSTANT i32 0 
%7:sgpr(p1) = G_IMPLICIT_DEF %9:vcc(s1) = COPY %0(s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir index c8762c0d578eb0..20b886ebdadfa3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-constant.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN +# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=WAVE64 +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=WAVE32 --- name: constant_v_s32 @@ -9,13 +10,21 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: constant_v_s32 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec - ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + ; WAVE64-LABEL: name: constant_v_s32 + ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE64: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + ; WAVE32-LABEL: name: constant_v_s32 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE32: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] %0:vgpr(s32) = G_CONSTANT i32 0 %1:vgpr(s32) = G_CONSTANT i32 1 %2:vgpr(s32) = G_CONSTANT i32 -1 @@ -32,13 +41,21 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: constant_s_s32 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 - ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 - ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; WAVE64-LABEL: name: constant_s_s32 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; 
WAVE64: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 + ; WAVE64: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; WAVE32-LABEL: name: constant_s_s32 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 + ; WAVE32: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] %0:sgpr(s32) = G_CONSTANT i32 0 %1:sgpr(s32) = G_CONSTANT i32 1 %2:sgpr(s32) = G_CONSTANT i32 -1 @@ -47,22 +64,67 @@ body: | S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 ... -# FIXME -# --- -# name: constant_v_s16 -# legalized: true -# regBankSelected: true -# tracksRegLiveness: true - -# body: | -# bb.0: -# %0:vgpry(s16) = G_CONSTANT i16 0 -# %1:vgpr(s16) = G_CONSTANT i16 1 -# %2:vgpr(s16) = G_CONSTANT i16 -1 -# %3:vgpr(s16) = G_CONSTANT i16 -54 -# %4:vgpr(s16) = G_CONSTANT i16 27 -# S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 -# ... +--- +name: constant_v_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; WAVE64-LABEL: name: constant_v_s16 + ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE64: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + ; WAVE32-LABEL: name: constant_v_s16 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -54, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE32: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]], implicit [[V_MOV_B32_e32_4]] + %0:vgpr(s16) = G_CONSTANT i16 0 + %1:vgpr(s16) = G_CONSTANT i16 1 + %2:vgpr(s16) = G_CONSTANT i16 -1 + %3:vgpr(s16) = G_CONSTANT i16 -54 + %4:vgpr(s16) = G_CONSTANT i16 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... 
+ +--- +name: constant_s_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; WAVE64-LABEL: name: constant_s_s16 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 + ; WAVE64: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + ; WAVE32-LABEL: name: constant_s_s16 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -54 + ; WAVE32: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 27 + ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]], implicit [[S_MOV_B32_4]] + %0:sgpr(s16) = G_CONSTANT i16 0 + %1:sgpr(s16) = G_CONSTANT i16 1 + %2:sgpr(s16) = G_CONSTANT i16 -1 + %3:sgpr(s16) = G_CONSTANT i16 -54 + %4:sgpr(s16) = G_CONSTANT i16 27 + S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4 +... --- name: constant_v_s64 @@ -72,32 +134,59 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: constant_v_s64 - ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; GCN: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec - ; GCN: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec - ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec - ; GCN: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; GCN: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 - ; GCN: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GCN: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 - ; GCN: 
[[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec - ; GCN: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec - ; GCN: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 - ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE64-LABEL: name: constant_v_s64 + ; WAVE64: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, [[V_MOV_B32_e32_1]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE64: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE64: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE64: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 + ; WAVE64: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE64: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec + ; WAVE64: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 + ; WAVE64: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] + ; WAVE32-LABEL: name: constant_v_s64 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_]], %subreg.sub0, 
[[V_MOV_B32_e32_1]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_2]], %subreg.sub0, [[V_MOV_B32_e32_3]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_5:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE32: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_4]], %subreg.sub0, [[V_MOV_B32_e32_5]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_6:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967242, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_7:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec + ; WAVE32: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_8:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 27, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_9:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[REG_SEQUENCE4:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_8]], %subreg.sub0, [[V_MOV_B32_e32_9]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_10:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_11:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_10]], %subreg.sub0, [[V_MOV_B32_e32_11]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_12:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_13:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; WAVE32: [[REG_SEQUENCE6:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_12]], %subreg.sub0, [[V_MOV_B32_e32_13]], %subreg.sub1 + ; WAVE32: [[V_MOV_B32_e32_14:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 23255, implicit $exec + ; WAVE32: [[V_MOV_B32_e32_15:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 -16, implicit $exec + ; WAVE32: [[REG_SEQUENCE7:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_14]], %subreg.sub0, [[V_MOV_B32_e32_15]], %subreg.sub1 + ; WAVE32: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]], implicit [[REG_SEQUENCE4]], implicit [[REG_SEQUENCE5]], implicit [[REG_SEQUENCE6]], implicit [[REG_SEQUENCE7]] %0:vgpr(s64) = G_CONSTANT i64 0 %1:vgpr(s64) = G_CONSTANT i64 1 %2:vgpr(s64) = G_CONSTANT i64 -1 @@ -117,24 +206,43 @@ tracksRegLiveness: true body: | bb.0: - ; GCN-LABEL: name: constant_s_s64 - ; GCN: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; GCN: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; GCN: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 - ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 - ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 - ; GCN: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 - ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 - ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 - ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 - ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 - ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = 
S_MOV_B32 23255 - ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 - ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 - ; GCN: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE64-LABEL: name: constant_s_s64 + ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; WAVE64: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; WAVE64: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; WAVE64: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; WAVE64: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 + ; WAVE64: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; WAVE64: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; WAVE64: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE64: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE64: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 + ; WAVE64: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE64: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 + ; WAVE64: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 + ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; WAVE32-LABEL: name: constant_s_s64 + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; WAVE32: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 + ; WAVE32: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967242 + ; WAVE32: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 + ; WAVE32: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 27 + ; WAVE32: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295 + ; WAVE32: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_2]], %subreg.sub0, [[S_MOV_B32_3]], %subreg.sub1 + ; WAVE32: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 1 + ; WAVE32: [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_4]], %subreg.sub0, [[S_MOV_B32_5]], %subreg.sub1 + ; WAVE32: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 23255 + ; WAVE32: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 -16 + ; WAVE32: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_6]], %subreg.sub0, [[S_MOV_B32_7]], %subreg.sub1 + ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[REG_SEQUENCE]], implicit [[S_MOV_B64_3]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] %0:sgpr(s64) = G_CONSTANT i64 0 %1:sgpr(s64) = G_CONSTANT i64 1 %2:sgpr(s64) = G_CONSTANT i64 -1 @@ -145,3 +253,27 @@ body: | 
%7:sgpr(s64) = G_CONSTANT i64 18446744004990098135 S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7 ... + +--- + +name: constant_i1_vcc +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + ; WAVE64-LABEL: name: constant_i1_vcc + ; WAVE64: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1 + ; WAVE64: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 0 + ; WAVE64: S_ENDPGM 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]] + ; WAVE32-LABEL: name: constant_i1_vcc + ; WAVE32: $vcc_hi = IMPLICIT_DEF + ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; WAVE32: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]] + %0:vcc(s1) = G_CONSTANT i1 true + %1:vcc(s1) = G_CONSTANT i1 false + S_ENDPGM 0 , implicit %0 , implicit %1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir index 9afa4b08c0ecb8..96e65617e33608 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fconstant.mir @@ -14,12 +14,16 @@ body: | ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1090519040, implicit $exec - ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]], implicit [[V_MOV_B32_e32_1]], implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]] + ; GCN: $vgpr0 = COPY [[V_MOV_B32_e32_]] + ; GCN: $vgpr1 = COPY [[V_MOV_B32_e32_1]] + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]] %0:vgpr(s32) = G_FCONSTANT float 1.0 %1:vgpr(s32) = G_FCONSTANT float 8.0 %2:vgpr(s32) = G_FCONSTANT float 1.0 %3:vgpr(s32) = G_FCONSTANT float 8.0 - S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + $vgpr0 = COPY %0 + $vgpr1 = COPY %1 + S_ENDPGM 0, implicit %2 , implicit %3 ... --- @@ -37,14 +41,14 @@ body: | ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 3238002688 ; GCN: $sgpr0 = COPY [[S_MOV_B32_]] ; GCN: $sgpr1 = COPY [[S_MOV_B32_1]] - ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_]], implicit [[S_MOV_B32_1]], implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] %0:sgpr(s32) = G_FCONSTANT float 1.0 %1:sgpr(s32) = G_FCONSTANT float 8.0 %2:sgpr(s32) = G_FCONSTANT float -1.0 %3:sgpr(s32) = G_FCONSTANT float -8.0 $sgpr0 = COPY %0 $sgpr1 = COPY %1 - S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + S_ENDPGM 0, implicit %2 , implicit %3 ... @@ -71,14 +75,14 @@ body: | ; GCN: [[REG_SEQUENCE3:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_MOV_B32_e32_6]], %subreg.sub0, [[V_MOV_B32_e32_7]], %subreg.sub1 ; GCN: $vgpr0_vgpr1 = COPY [[REG_SEQUENCE]] ; GCN: $vgpr2_vgpr3 = COPY [[REG_SEQUENCE1]] - ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE]], implicit [[REG_SEQUENCE1]], implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] + ; GCN: S_ENDPGM 0, implicit [[REG_SEQUENCE2]], implicit [[REG_SEQUENCE3]] %0:vgpr(s64) = G_FCONSTANT double 1.0 %1:vgpr(s64) = G_FCONSTANT double 8.0 %2:vgpr(s64) = G_FCONSTANT double -2.0 %3:vgpr(s64) = G_FCONSTANT double 10.0 $vgpr0_vgpr1 = COPY %0 $vgpr2_vgpr3 = COPY %1 - S_ENDPGM 0, implicit %0 , implicit %1 , implicit %2 , implicit %3 + S_ENDPGM 0, implicit %2 , implicit %3 ... 
@@ -122,14 +126,22 @@ body: | ; GCN-LABEL: name: fconstant_v_s16 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 18432, implicit $exec + ; GCN: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 15360, implicit $exec + ; GCN: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 18432, implicit $exec ; GCN: $vgpr0 = COPY [[V_MOV_B32_e32_]] ; GCN: $vgpr1 = COPY [[V_MOV_B32_e32_1]] + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_2]], implicit [[V_MOV_B32_e32_3]] %0:vgpr(s16) = G_FCONSTANT half 1.0 %1:vgpr(s16) = G_FCONSTANT half 8.0 %2:vgpr(s32) = G_ANYEXT %0 %3:vgpr(s32) = G_ANYEXT %1 + + ; Test without already assigned register class + %4:vgpr(s16) = G_FCONSTANT half 1.0 + %5:vgpr(s16) = G_FCONSTANT half 8.0 $vgpr0 = COPY %2 $vgpr1 = COPY %3 + S_ENDPGM 0, implicit %4, implicit %5 ... @@ -146,14 +158,21 @@ body: | ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 18432 ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] + ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 15360 + ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 18432 ; GCN: $sgpr0 = COPY [[COPY]] ; GCN: $sgpr1 = COPY [[COPY1]] + ; GCN: S_ENDPGM 0, implicit [[S_MOV_B32_2]], implicit [[S_MOV_B32_3]] %0:sgpr(s16) = G_FCONSTANT half 1.0 %1:sgpr(s16) = G_FCONSTANT half 8.0 %2:vgpr(s32) = G_ANYEXT %0 %3:vgpr(s32) = G_ANYEXT %1 + + ; Test without already assigned register class + %4:sgpr(s16) = G_FCONSTANT half 1.0 + %5:sgpr(s16) = G_FCONSTANT half 8.0 $sgpr0 = COPY %2 $sgpr1 = COPY %3 + S_ENDPGM 0, implicit %4, implicit %5 ... - diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir index 7f1f52d2c522ac..966bb8c629500a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-or.mir @@ -420,34 +420,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], 
implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_OR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 %0:vgpr(s1) = G_TRUNC %1(s32) - %2:sgpr(s1) = G_CONSTANT i1 true + %sgpr0:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s1) = G_TRUNC %sgpr0 %6:sgpr(s32) = G_CONSTANT i32 0 %7:sgpr(p1) = G_IMPLICIT_DEF %9:vcc(s1) = COPY %0(s1) @@ -466,34 +467,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_OR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_OR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 %0:vgpr(s1) = G_TRUNC %1(s32) - %2:sgpr(s1) = G_CONSTANT i1 true + %sgpr0:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s1) = G_TRUNC %sgpr0 %6:sgpr(s32) = G_CONSTANT i32 0 %7:sgpr(p1) = G_IMPLICIT_DEF %9:vcc(s1) = COPY %0(s1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir index f923a4c9f02b81..0364cb736c601a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-xor.mir @@ -421,34 +421,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; 
WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave32 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32 = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE32: [[COPY1:%[0-9]+]]:sreg_32_xm0 = COPY [[S_XOR_B32_]] ; WAVE32: S_ENDPGM 0, implicit [[COPY1]] %1:vgpr(s32) = COPY $vgpr0 %0:vgpr(s1) = G_TRUNC %1(s32) - %2:sgpr(s1) = G_CONSTANT i1 true + %sgpr0:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s1) = G_TRUNC %sgpr0 %6:sgpr(s32) = G_CONSTANT i32 0 %7:sgpr(p1) = G_IMPLICIT_DEF %9:vcc(s1) = COPY %0(s1) @@ -467,34 +468,35 @@ regBankSelected: true tracksRegLiveness: true body: | bb.0: - liveins: $vgpr0 + liveins: $vgpr0, $sgpr0 ; WAVE64-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE64: liveins: $vgpr0 + ; WAVE64: liveins: $vgpr0, $sgpr0 ; WAVE64: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; WAVE64: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 + ; WAVE64: %sgpr0:sreg_32 = COPY $sgpr0 ; WAVE64: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec ; WAVE64: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec - ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc + ; WAVE64: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc ; WAVE64: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec ; WAVE64: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc ; WAVE64: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B64_]] ; WAVE64: S_ENDPGM 0, implicit [[COPY1]] ; WAVE32-LABEL: name: copy_select_constrain_vcc_result_reg_wave64 - ; WAVE32: liveins: $vgpr0 + ; WAVE32: liveins: $vgpr0, $sgpr0 ; WAVE32: $vcc_hi = IMPLICIT_DEF ; WAVE32: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; 
WAVE32: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+    ; WAVE32: %sgpr0:sreg_32 = COPY $sgpr0
    ; WAVE32: [[V_AND_B32_e32_:%[0-9]+]]:vgpr_32 = V_AND_B32_e32 1, [[COPY]], implicit $exec
    ; WAVE32: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[V_AND_B32_e32_]], implicit $exec
-    ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, [[S_MOV_B32_]], implicit-def $scc
+    ; WAVE32: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, %sgpr0, implicit-def $scc
    ; WAVE32: [[V_CMP_NE_U32_e64_1:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[S_AND_B32_]], implicit $exec
    ; WAVE32: [[S_XOR_B32_:%[0-9]+]]:sreg_32_xm0_xexec = S_XOR_B32 [[V_CMP_NE_U32_e64_]], [[V_CMP_NE_U32_e64_1]], implicit-def dead $scc
    ; WAVE32: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY [[S_XOR_B32_]]
    ; WAVE32: S_ENDPGM 0, implicit [[COPY1]]
    %1:vgpr(s32) = COPY $vgpr0
    %0:vgpr(s1) = G_TRUNC %1(s32)
-    %2:sgpr(s1) = G_CONSTANT i1 true
+    %sgpr0:sgpr(s32) = COPY $sgpr0
+    %2:sgpr(s1) = G_TRUNC %sgpr0
    %6:sgpr(s32) = G_CONSTANT i32 0
    %7:sgpr(p1) = G_IMPLICIT_DEF
    %9:vcc(s1) = COPY %0(s1)

From 2f5f5febf3e4fa9bc80e8a8f63a99d3e6813c499 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 26 Jul 2020 15:43:48 -0400
Subject: [PATCH 041/101] AMDGPU/GlobalISel: Select llvm.amdgcn.groupstaticsize

Previously, this would select successfully but then assert while expanding
the pseudoinstruction if the OS was neither HSA nor PAL. We don't need the
pseudoinstruction anymore since we know the total size after legalization.
---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 29 ++++++++++++
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  1 +
 .../inst-select-amdgcn.groupstaticsize.mir    | 46 +++++++++++++++++++
 .../AMDGPU/llvm.amdgcn.groupstaticsize.ll     |  4 ++
 4 files changed, 80 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.groupstaticsize.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f2ecc50d472e54..c9f9eb6988f15a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -930,6 +930,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
     return selectBallot(I);
   case Intrinsic::amdgcn_reloc_constant:
     return selectRelocConstant(I);
+  case Intrinsic::amdgcn_groupstaticsize:
+    return selectGroupStaticSize(I);
   case Intrinsic::returnaddress:
     return selectReturnAddress(I);
   default:
@@ -1137,6 +1139,33 @@ bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   return true;
 }
 
+bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
+  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
+
+  Register DstReg = I.getOperand(0).getReg();
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
+    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
+
+  MachineBasicBlock *MBB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+
+  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
+
+  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
+    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+    MIB.addImm(MFI->getLDSSize());
+  } else {
+    Module *M = MF->getFunction().getParent();
+    const GlobalValue *GV
+      = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
+    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
+  }
+
+  I.eraseFromParent();
+  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
   MachineBasicBlock *MBB = I.getParent();
   MachineFunction &MF = *MBB->getParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index b18867299baf9a..969ef59363085f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -110,6 +110,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectIntrinsicIcmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
   bool selectRelocConstant(MachineInstr &I) const;
+  bool selectGroupStaticSize(MachineInstr &I) const;
   bool selectReturnAddress(MachineInstr &I) const;
   bool selectG_INTRINSIC(MachineInstr &I) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.groupstaticsize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.groupstaticsize.mir
new file mode 100644
index 00000000000000..4e45fe689dd74f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.groupstaticsize.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=HSAPAL %s
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=HSAPAL %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=MESA %s
+
+---
+name: groupstaticsize_v
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  ldsSize: 4096
+
+body: |
+  bb.0:
+
+    ; HSAPAL-LABEL: name: groupstaticsize_v
+    ; HSAPAL: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
+    ; HSAPAL: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
+    ; MESA-LABEL: name: groupstaticsize_v
+    ; MESA: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @llvm.amdgcn.groupstaticsize, implicit $exec
+    ; MESA: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
+    %0:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize)
+    S_ENDPGM 0, implicit %0
+...
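+
+# groupstaticsize_s below exercises the SGPR destination bank: the selector
+# picks S_MOV_B32 instead of V_MOV_B32_e32 and, as above, materializes either
+# the ldsSize value from machineFunctionInfo (amdhsa/amdpal) or an
+# amdgpu-abs32-lo relocation against the intrinsic symbol (mesa3d).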
+ +--- +name: groupstaticsize_s +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + ldsSize: 1024 + +body: | + bb.0: + + ; HSAPAL-LABEL: name: groupstaticsize_s + ; HSAPAL: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 + ; HSAPAL: S_ENDPGM 0, implicit [[S_MOV_B32_]] + ; MESA-LABEL: name: groupstaticsize_s + ; MESA: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @llvm.amdgcn.groupstaticsize + ; MESA: S_ENDPGM 0, implicit [[S_MOV_B32_]] + %0:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.groupstaticsize) + S_ENDPGM 0, implicit %0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll index 3224d8a3594ad8..db4032efceabb3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -2,6 +2,10 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s +; RUN: llc -global-isel -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s + @lds0 = addrspace(3) global [512 x float] undef, align 4 @lds1 = addrspace(3) global [256 x float] undef, align 4 From d5621b83a58c1faaa0e413ac7c4f0ca8811d0c61 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Aug 2020 14:52:23 +0100 Subject: [PATCH 042/101] [X86][AVX] lowerShuffleWithVTRUNC - pull out TRUNCATE/VTRUNC creation into helper code. NFCI. Prep work toward adding v16i16/v32i8 support for lowerShuffleWithVTRUNC and improving lowerShuffleWithVPMOV. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 59 +++++++++++++++---------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 27dee97edb2fd8..6238366f6c268c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11286,6 +11286,37 @@ static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, return false; } +// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper +// element padding to the final DstVT. +static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, + const X86Subtarget &Subtarget, + SelectionDAG &DAG, bool ZeroUppers) { + MVT SrcVT = Src.getSimpleValueType(); + unsigned NumDstElts = DstVT.getVectorNumElements(); + unsigned NumSrcElts = SrcVT.getVectorNumElements(); + + // Perform a direct ISD::TRUNCATE if possible. + if (NumSrcElts == NumDstElts) + return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); + + if (NumSrcElts > NumDstElts) { + MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); + return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); + } + + // Non-VLX targets must truncate from a 512-bit type, so we need to + // widen, truncate and then possibly extract the original subvector. 
+  if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
+    SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
+    return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
+  }
+
+  // Fallback to a X86ISD::VTRUNC.
+  // TODO: Handle cases where we go from 512-bit vectors to sub-128-bit vectors.
+  return DAG.getNode(X86ISD::VTRUNC, DL, DstVT, Src);
+}
+
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                 int Delta) {
   int Size = (int)Mask.size();
@@ -11388,7 +11419,7 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
 
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
-  unsigned MaxScale = 64 / VT.getScalarSizeInBits();
+  unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
     // TODO: Support non-BWI VPMOVWB truncations?
     unsigned SrcEltBits = EltSizeInBits * Scale;
@@ -11408,36 +11439,18 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
     if (UpperElts > 0 &&
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
       continue;
+    bool UndefUppers =
+        UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
 
     // As we're using both sources, we need to concat them together
-    // and truncate from the 256-bit src.
+    // and truncate from the double-sized src.
     MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
     SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
     MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
     MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
     Src = DAG.getBitcast(SrcVT, Src);
-
-    if (SrcVT.getVectorNumElements() == NumElts)
-      return DAG.getNode(ISD::TRUNCATE, DL, VT, Src);
-
-    if (!Subtarget.hasVLX()) {
-      // Non-VLX targets must truncate from a 512-bit type, so we need to
-      // widen, truncate and then possibly extract the original 128-bit
-      // vector.
-      bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
-      Src = widenSubVector(Src, !UndefUppers, Subtarget, DAG, DL, 512);
-      unsigned NumWideSrcElts = Src.getValueType().getVectorNumElements();
-      if (NumWideSrcElts >= NumElts) {
-        // Widening means we can now use a regular TRUNCATE.
-        MVT WideVT = MVT::getVectorVT(VT.getScalarType(), NumWideSrcElts);
-        SDValue WideRes = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Src);
-        if (!WideVT.is128BitVector())
-          WideRes = extract128BitVector(WideRes, 0, DAG, DL);
-        return WideRes;
-      }
-    }
-    return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+    return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
   }
 
   return SDValue();

From c98fcba55cf615b078b3943ee9e65356ba23414b Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 18 Aug 2020 10:14:07 -0400
Subject: [PATCH 043/101] [SLP] remove instcombine dependency from regression
 test; NFC

InstCombine doesn't do that much here - sinks some instructions and improves
alignments - but that should not be part of the SLP pass unit testing.
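
For illustration, the alignment improvement is the main thing InstCombine
contributed here, and the updated test now encodes it by hand. A minimal
sketch of that inference (hypothetical global @g, not taken from the test):

  @g = global [4 x i32] zeroinitializer, align 16

  define i32 @load_first_elt() {
    ; The pointer is the start of @g, so InstCombine raises this "align 4"
    ; to the global's known 16-byte alignment.
    %v = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @g, i64 0, i64 0), align 4
    ret i32 %v
  }

Running opt -instcombine over this rewrites the load to "align 16"; the same
reasoning explains the "align 16" and "align 8" annotations now written
directly into the test IR below.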
--- .../Transforms/SLPVectorizer/X86/limit.ll | 49 ++++++++++--------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/limit.ll b/llvm/test/Transforms/SLPVectorizer/X86/limit.ll index 41db490a754f6b..e6d78c0c0e378e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/limit.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/limit.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s --instcombine -slp-vectorizer -S | FileCheck %s +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -31,40 +32,40 @@ define void @addsub() { ; CHECK-NEXT: ret void ; entry: - %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 0), align 4 - %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 0), align 4 - %add = add nsw i32 %0, %1 br label %bb1 -bb1: ; preds = %entry - %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 0), align 4 - %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 0), align 4 + +bb1: + %0 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i64 0), align 16 + %1 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i64 0, i64 0), align 16 + %add = add nsw i32 %0, %1 + %2 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i64 0, i64 0), align 16 + %3 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 0), align 16 %add1 = add nsw i32 %2, %3 %add2 = add nsw i32 %add, %add1 - store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 0), align 4 - %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 1), align 4 - %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 1), align 4 + store i32 %add2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 0), align 16 + %4 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i64 1), align 4 + %5 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i64 0, i64 1), align 4 %add3 = add nsw i32 %4, %5 - %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 1), align 4 - %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 1), align 4 + %6 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i64 0, i64 1), align 4 + %7 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 1), align 4 %add4 = add nsw i32 %6, %7 %sub = sub nsw i32 %add3, %add4 - store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 1), align 4 - %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 2), align 4 - %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 2), align 4 + store i32 %sub, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 1), align 4 + %8 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i64 2), align 8 + %9 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i64 0, i64 2), align 8 %add5 = add nsw i32 %8, %9 - %10 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 2), align 4 - %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 2), align 4 + %10 = load i32, i32* 
getelementptr inbounds ([4 x i32], [4 x i32]* @d, i64 0, i64 2), align 8 + %11 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 2), align 8 %add6 = add nsw i32 %10, %11 %add7 = add nsw i32 %add5, %add6 - store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 2), align 4 - %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i32 0, i64 3), align 4 - %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i32 0, i64 3), align 4 + store i32 %add7, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 2), align 8 + %12 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @b, i64 0, i64 3), align 4 + %13 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @c, i64 0, i64 3), align 4 %add8 = add nsw i32 %12, %13 - %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i32 0, i64 3), align 4 - %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i32 0, i64 3), align 4 + %14 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @d, i64 0, i64 3), align 4 + %15 = load i32, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @e, i64 0, i64 3), align 4 %add9 = add nsw i32 %14, %15 %sub10 = sub nsw i32 %add8, %add9 - store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i32 0, i64 3), align 4 + store i32 %sub10, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 3), align 4 ret void } - From 011bf4fd9679c8a7dd7e3a6fc9a696e417ce3c53 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Aug 2020 15:24:28 +0100 Subject: [PATCH 044/101] [X86][AVX] lowerShuffleWithVTRUNC - extend to support v16i16/v32i8 binary shuffles. This requires a few additional SrcVT vs DstVT padding cases in getAVX512TruncNode. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 34 ++- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 251 ++---------------- .../CodeGen/X86/vector-shuffle-256-v32.ll | 20 +- .../CodeGen/X86/x86-interleaved-access.ll | 53 ++-- 4 files changed, 79 insertions(+), 279 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6238366f6c268c..0fbabdc5dfdf0b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -11292,19 +11292,28 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers) { MVT SrcVT = Src.getSimpleValueType(); + MVT DstSVT = DstVT.getScalarType(); unsigned NumDstElts = DstVT.getVectorNumElements(); unsigned NumSrcElts = SrcVT.getVectorNumElements(); + unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); // Perform a direct ISD::TRUNCATE if possible. if (NumSrcElts == NumDstElts) return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); if (NumSrcElts > NumDstElts) { - MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts); + MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); } + if ((NumSrcElts * DstEltSizeInBits) >= 128) { + MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); + return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, + DstVT.getSizeInBits()); + } + // Non-VLX targets must truncate from a 512-bit type, so we need to // widen, truncate and then possibly extract the original subvector. 
   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
@@ -11312,9 +11321,13 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
   }
 
-  // Fallback to a X86ISD::VTRUNC.
-  // TODO: Handle cases where we go from 512-bit vectors to sub-128-bit vectors.
-  return DAG.getNode(X86ISD::VTRUNC, DL, DstVT, Src);
+  // Fallback to a X86ISD::VTRUNC, padding if necessary.
+  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+  if (DstVT != TruncVT)
+    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+                           DstVT.getSizeInBits());
+  return Trunc;
 }
 
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
@@ -11413,7 +11426,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unexpected VTRUNC type");
   if (!Subtarget.hasAVX512())
     return SDValue();
 
@@ -16893,6 +16907,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Subtarget))
       return V;
 
+  // Try to lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
@@ -17003,6 +17022,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                  Subtarget))
       return V;
 
+  // Try to lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 6c5f5125109dbd..de13135ebb5310 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -176,89 +176,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { } define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-NEXT: vpmovdb %ymm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpmovdb %ymm1, %xmm1 -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq 
-; -; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovdb %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> store <16 x i8> %strided.vec, <16 x i8>* %S @@ -280,80 +203,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { } define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX512F-LABEL: shuffle_v32i16_to_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7] -; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512VBMI-NEXT: vpblendd 
{{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq +; AVX512-LABEL: shuffle_v32i16_to_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqw %zmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <32 x i16>, <32 x i16>* %L %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> store <8 x i16> %strided.vec, <8 x i16>* %S @@ -375,81 +230,13 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { } define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1 -; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1 -; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1 -; 
AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] -; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq +; AVX512-LABEL: shuffle_v64i8_to_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 +; AVX512-NEXT: vpmovqb %zmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, (%rsi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> store <8 x i8> %strided.vec, <8 x i8>* %S diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index e5285aebda69e3..b2c0acdf9b2287 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4843,19 +4843,13 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_ ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; -; AVX512VLBW-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512VLBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512VLBW-NEXT: retq -; -; AVX512VLVBMI-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: -; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63] -; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 -; AVX512VLVBMI-NEXT: retq +; AVX512VL-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpsrlw $8, %zmm0, %zmm0 +; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512VL-NEXT: retq ; ; XOPAVX1-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 047978c8a0dac7..a540d04626ae83 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -576,46 +576,41 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) { ; ; AVX512-LABEL: interleaved_load_vf16_i8_stride4: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5 -; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX512-NEXT: vmovdqa (%rdi), %ymm5 -; AVX512-NEXT: vpmovdb %zmm5, %xmm5 -; 
AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
+; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm8
+; AVX512-NEXT:    vmovdqa (%rdi), %xmm1
+; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT:    vmovdqa 48(%rdi), %xmm4
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm5 =
-; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm6
-; AVX512-NEXT:    vpshufb %xmm5, %xmm2, %xmm5
+; AVX512-NEXT:    vpshufb %xmm5, %xmm4, %xmm6
+; AVX512-NEXT:    vpshufb %xmm5, %xmm3, %xmm5
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm0, %xmm6
+; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm1, %xmm6
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT:    vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 =
-; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm7
-; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm6
 ; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm4
-; AVX512-NEXT:    vpshufb %xmm7, %xmm0, %xmm7
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
+; AVX512-NEXT:    vpshufb %xmm7, %xmm2, %xmm0
+; AVX512-NEXT:    vpshufb %xmm7, %xmm1, %xmm7
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
 ; AVX512-NEXT:    vmovdqa {{.*#+}} xmm6 =
+; AVX512-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX512-NEXT:    vpcmpeqb %zmm5, %zmm8, %k0
-; AVX512-NEXT:    vpcmpeqb %zmm0, %zmm4, %k1
+; AVX512-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
 ; AVX512-NEXT:    kxnorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0

From abd33bf5eff2419e0f49ce494039bceefe8e1085 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 18 Aug 2020 15:46:02 +0100
Subject: [PATCH 045/101] [X86][AVX] lowerShuffleWithPERMV - pad 128/256-bit
 shuffles on non-VLX targets

Allow non-VLX targets to use 512-bit VPERMV/VPERMV3 for 128/256-bit shuffles.

TBH I'm not sure these targets actually exist in the wild, but we're testing
for them and it's good test coverage for shuffle lowering/combines across
different subvector widths.
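
A sketch of the effect (hypothetical function, not one of the updated tests):
a two-source v16i16 shuffle whose mask crosses 128-bit lanes, e.g.

  define <16 x i16> @permv3_pad(<16 x i16> %a, <16 x i16> %b) {
    ; No in-lane unpack/blend/shift pattern covers this mask, so it falls
    ; through to the VPERMV3 path.
    %r = shufflevector <16 x i16> %a, <16 x i16> %b,
        <16 x i32> <i32 0, i32 31, i32 2, i32 29, i32 4, i32 27, i32 6, i32 25,
                    i32 8, i32 23, i32 10, i32 21, i32 12, i32 19, i32 14, i32 17>
    ret <16 x i16> %r
  }

can now be selected on AVX512BW targets without VLX by widening both sources
and the mask to 512 bits (upper halves undef), issuing a single VPERMT2W on
zmm registers, and extracting the low 256 bits, instead of being decomposed
into per-lane shuffles.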
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 59 ++++++++++------ .../X86/shuffle-strided-with-offset-512.ll | 67 ++++++------------- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 30 ++++----- 3 files changed, 70 insertions(+), 86 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0fbabdc5dfdf0b..ec4d236dc3ea19 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14969,17 +14969,35 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef Mask, Mask, Subtarget, DAG); } +// Lowers unary/binary shuffle as VPERMV/VPERMV3, for non-VLX targets, +// sub-512-bit shuffles are padded to 512-bits for the shuffle and then +// the active subvector is extracted. static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, - ArrayRef Mask, SDValue V1, - SDValue V2, SelectionDAG &DAG) { + ArrayRef Mask, SDValue V1, SDValue V2, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements()); - SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true); + + MVT ShuffleVT = VT; + if (!VT.is512BitVector() && !Subtarget.hasVLX()) { + V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512); + V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512); + MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512); + ShuffleVT = V1.getSimpleValueType(); + } + + SDValue Result; if (V2.isUndef()) - return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1); + Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1); + else + Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2); + + if (VT != ShuffleVT) + Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits()); - return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2); + return Result; } /// Generic lowering of v16i8 shuffles. @@ -15208,9 +15226,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Unpack; - // If we have VBMI we can use one VPERM instead of multiple PSHUFBs. - if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG); + // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). + if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget, + DAG); // If we have XOP we can use one VPPERM instead of multiple PSHUFBs. if (Subtarget.hasXOP()) { @@ -16964,9 +16983,9 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // AVX512BWVL can lower to VPERMW. - if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG); + // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16). + if (Subtarget.hasBWI()) + return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. @@ -17069,9 +17088,9 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - // AVX512VBMIVL can lower to VPERMB. - if (Subtarget.hasVBMI() && Subtarget.hasVLX()) - return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG); + // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). 
+ if (Subtarget.hasVBMI()) + return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG); // Try to simplify this by merging 128-bit lanes to enable a lane-based // shuffle. @@ -17325,7 +17344,7 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 16-lane 32-bit floating point shuffles. @@ -17384,7 +17403,7 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef Mask, V1, V2, DAG, Subtarget)) return V; - return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 8-lane 64-bit integer shuffles. @@ -17447,7 +17466,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 16-lane 32-bit integer shuffles. @@ -17524,7 +17543,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return Blend; - return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 32-lane 16-bit integer shuffles. @@ -17587,7 +17606,7 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, Zeroable, Subtarget, DAG)) return PSHUFB; - return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG); } /// Handle lowering of 64-lane 8-bit integer shuffles. @@ -17643,7 +17662,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef Mask, // VBMI can use VPERMV/VPERMV3 byte shuffles. if (Subtarget.hasVBMI()) - return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG); + return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); // Try to create an in-lane repeating shuffle mask and then shuffle the // results into the target lanes. 
diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll index 969ac375a70e33..40cd2fcd4fdeb7 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -85,12 +85,10 @@ define void @shuffle_v32i16_to_v16i16_1(<32 x i16>* %L, <16 x i16>* %S) nounwind ; ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,17,19,21,23,9,11,13,15,25,27,29,31] +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,2,1,3] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -260,20 +258,11 @@ define void @shuffle_v32i16_to_v8i16_1(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <1,5,9,13,17,21,25,29,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_1: @@ -327,20 +316,11 @@ define void @shuffle_v32i16_to_v8i16_2(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,10,14,18,22,26,30,u,u,u,u,u,u,u,u> +; 
AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_2: @@ -394,20 +374,11 @@ define void @shuffle_v32i16_to_v8i16_3(<32 x i16>* %L, <8 x i16>* %S) nounwind { ; ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,7,11,15,19,23,27,31,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512BW-NEXT: vmovdqa %xmm1, (%rsi) +; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16_3: diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index de13135ebb5310..9e3c92aca5da3a 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -328,8 +328,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq @@ -413,8 +413,8 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_ ; ; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vpermt2b %zmm0, %zmm1, %zmm0 ; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq @@ -457,13 +457,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; ; AVX512BW-LABEL: PR34175: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm0[0],xmm1[1],xmm0[2,3] -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqu (%rdi), %ymm1 +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512BW-NEXT: retq ; @@ -478,13 +475,10 @@ define <4 x double> @PR34175(<32 x i16>* %p) { ; ; AVX512VBMI-LABEL: PR34175: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm0 = <0,8,16,24,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqu (%rdi), %ymm1 +; AVX512VBMI-NEXT: vpermt2w %zmm0, %zmm0, %zmm1 +; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX512VBMI-NEXT: retq ; From 8c9ffe34d932e2e17cbcf351d6e37783ea5453ae Mon Sep 17 00:00:00 2001 From: Nathan James Date: Tue, 18 Aug 2020 15:52:37 +0100 Subject: [PATCH 046/101] [NFC][clang-tidy] Put abseil headers in alphabetical order --- .../clang-tidy/abseil/AbseilMatcher.h | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h index f58ff5bc44b214..335c333573f43b 100644 --- a/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h +++ b/clang-tools-extra/clang-tidy/abseil/AbseilMatcher.h @@ -47,14 +47,18 @@ AST_POLYMORPHIC_MATCHER( if (PrefixPosition == StringRef::npos) return false; Path = Path.drop_front(PrefixPosition + AbslPrefix.size()); - static const char *AbseilLibraries[] = { - "algorithm", "base", "container", "debugging", "flags", - "hash", "iterator", "memory", "meta", "numeric", - "random", "strings", "synchronization", "status", "time", - "types", "utility"}; - return std::any_of( - std::begin(AbseilLibraries), std::end(AbseilLibraries), - [&](const char *Library) { return Path.startswith(Library); }); + static const char *AbseilLibraries[] = {"algorithm", "base", + "container", "debugging", + "flags", "hash", + "iterator", "memory", + "meta", "numeric", + "random", "status", + "strings", "synchronization", + "time", "types", + "utility"}; + return llvm::any_of(AbseilLibraries, [&](const char *Library) { + return Path.startswith(Library); + }); } } // namespace ast_matchers From b8088ada05269819dbc95542ea125d074b451abf Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 18 Aug 2020 16:02:21 +0100 Subject: [PATCH 047/101] [LV] Predicated reduction tests. 
NFC --- .../ARM/mve-reduction-predselect.ll | 644 ++++++++++++++++++ .../LoopVectorize/reduction-predselect.ll | 305 +++++++++ 2 files changed, 949 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll create mode 100644 llvm/test/Transforms/LoopVectorize/reduction-predselect.ll diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll new file mode 100644 index 00000000000000..da5b5a60a400ca --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll @@ -0,0 +1,644 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-none-none-eabi" + +define i32 @reduction_sum_single(i32* noalias nocapture %A) { +; CHECK-LABEL: @reduction_sum_single( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef) +; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 +; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[DOTLR_PH:%.*]] +; CHECK: .lr.ph: +; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !2 +; CHECK: ._crit_edge: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; +entry: + br label %.lr.ph + +.lr.ph: ; preds = %entry, %.lr.ph + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] + %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ] + %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l3 = load i32, i32* %l2, align 4 + %l7 = add i32 %sum.02, %l3 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph + %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) { +; CHECK-LABEL: @reduction_sum( +; CHECK-NEXT: 
entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !5
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
+;
+entry:
+  br label %.lr.ph
+
+.lr.ph: ; preds = %entry, %.lr.ph
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
+  %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ]
+  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l3 = load i32, i32* %l2, align 4
+  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
+  %l5 = load i32, i32* %l4, align 4
+  %l7 = add i32 %sum.02, %indvars.iv
+  %l8 = add i32 %l7, %l3
+  %l9 = add i32 %l8, %l5
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph
+  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @reduction_prod(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
+; CHECK-NEXT: br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
+; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
+; CHECK: .lr.ph:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[L2]], align 4
+; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L5:%.*]] = load i32, i32* [[L4]], align 4
+; CHECK-NEXT: [[L8:%.*]] = mul i32 [[PROD_02]], [[L3]]
+; CHECK-NEXT: [[L9]] = mul i32 [[L8]], [[L5]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
+; CHECK: ._crit_edge:
+; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
+;
+entry:
+  br label %.lr.ph
+
+.lr.ph: ; preds = %entry, %.lr.ph
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
+  %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ]
+  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l3 = load i32, i32* %l2, align 4
+  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
+  %l5 = load i32, i32* %l4, align 4
+  %l8 = mul i32 %prod.02, %l3
+  %l9 = mul i32 %l8, %l5
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph
+  %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ]
+  ret i32 %prod.0.lcssa
+}
+
+define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_and(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; 
CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ -1, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]] +; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[L1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9 +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv + %l1 = load i32, i32* %arrayidx2, align 4 + %add = and i32 %result.08, %l0 + %and = and i32 %add, %l1 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %and, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) { +; CHECK-LABEL: @reduction_or( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]] +; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11 +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv + %l1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %l1, %l0 + %or = or i32 %add, %result.08 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %or, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) { +; CHECK-LABEL: @reduction_xor( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ 
zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]] +; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13 +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv + %l1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %l1, %l0 + %xor = xor i32 %add, %result.08 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %xor, %for.body ] + ret i32 %result.0.lcssa +} + +define float @reduction_fadd(float* nocapture %A, float* nocapture %B) { +; CHECK-LABEL: @reduction_fadd( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14 +; CHECK: middle.block: +; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]]) +; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]] +; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]] +; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15 +; CHECK: for.end: +; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[RESULT_0_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv + %l0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv + %l1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %result.08, %l0 + %fadd = fadd fast float %add, %l1 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi float [ %fadd, %for.body ] + ret float %result.0.lcssa +} + +define float @reduction_fmul(float* nocapture %A, float* nocapture %B) { +; CHECK-LABEL: @reduction_fmul( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], 
label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
+; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]]
+; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
+; CHECK: for.end:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
+  %l0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
+  %l1 = load float, float* %arrayidx2, align 4
+  %add = fmul fast float %result.08, %l0
+  %fmul = fmul fast float %add, %l1
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi float [ %fmul, %for.body ]
+  ret float %result.0.lcssa
+}
+
+define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_min(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !18
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[C0:%.*]] = icmp slt i32 [[RESULT_08]], [[L0]]
+; CHECK-NEXT: [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !19
+; CHECK: for.end:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l0 = load i32, i32* %arrayidx, align 4
+  %c0 = icmp slt i32 %result.08, %l0
+  %v0 = select i1 %c0, i32 %result.08, i32 %l0
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ %v0, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_max(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !20
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ 1000, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[V0:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[C0:%.*]] = icmp ugt i32 [[RESULT_08]], [[L0]]
+; CHECK-NEXT: [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !21
+; CHECK: for.end:
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l0 = load i32, i32* %arrayidx, align 4
+  %c0 = icmp ugt i32 %result.08, %l0
+  %v0 = select i1 %c0, i32 %result.08, i32 %l0
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ %v0, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+define float @reduction_fmax(float* nocapture %A, float* nocapture %B) {
+; CHECK-LABEL: @reduction_fmax(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[V0:%.*]], [[FOR_BODY]] ], [ 1.000000e+03, [[ENTRY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDVARS_IV]]
+; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[C0:%.*]] = fcmp ogt float [[RESULT_08]], [[L0]]
+; CHECK-NEXT: [[V0]] = select i1 [[C0]], float [[RESULT_08]], float [[L0]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 
[[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret float [[V0]] +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi float [ %v0, %for.body ], [ 1000.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv + %l0 = load float, float* %arrayidx, align 4 + %c0 = fcmp ogt float %result.08, %l0 + %v0 = select i1 %c0, float %result.08, float %l0 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi float [ %v0, %for.body ] + ret float %result.0.lcssa +} diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll new file mode 100644 index 00000000000000..aaae03b9fb3a32 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -0,0 +1,305 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -prefer-predicate-over-epilog -force-reduction-intrinsics -dce -instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define i32 @reduction_sum_single(i32* noalias nocapture %A) { +; CHECK-LABEL: @reduction_sum_single( +; CHECK: vector.body: +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP24:%.*]], %pred.load.continue6 ] +; CHECK: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23:%.*]] +; CHECK: middle.block: +; CHECK: [[TMP26:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] +; CHECK: [[TMP27:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP26]]) +; +entry: + br label %.lr.ph + +.lr.ph: ; preds = %entry, %.lr.ph + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] + %sum.02 = phi i32 [ %l7, %.lr.ph ], [ 0, %entry ] + %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l3 = load i32, i32* %l2, align 4 + %l7 = add i32 %sum.02, %l3 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph + %sum.0.lcssa = phi i32 [ %l7, %.lr.ph ] + ret i32 %sum.0.lcssa +} + +define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B) { +; CHECK-LABEL: @reduction_sum( +; CHECK: vector.body: +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP46:%.*]], %pred.load.continue14 ] +; CHECK: [[TMP44:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND:%.*]] +; CHECK: [[TMP45:%.*]] = add <4 x i32> [[TMP44]], [[TMP23:%.*]] +; CHECK: [[TMP46]] = add <4 x i32> [[TMP45]], [[TMP43:%.*]] +; CHECK: middle.block: +; CHECK: [[TMP48:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]] +; CHECK: [[TMP49:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP48]]) +; +entry: + br label %.lr.ph + +.lr.ph: ; preds = %entry, %.lr.ph + %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ] + %sum.02 = phi i32 [ %l9, %.lr.ph ], [ 0, %entry ] + %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l3 = load i32, i32* %l2, align 
4
+  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
+  %l5 = load i32, i32* %l4, align 4
+  %l7 = add i32 %sum.02, %indvars.iv
+  %l8 = add i32 %l7, %l3
+  %l9 = add i32 %l8, %l5
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph
+  %sum.0.lcssa = phi i32 [ %l9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B) {
+; CHECK-LABEL: @reduction_prod(
+; CHECK: vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
+; CHECK: [[TMP44:%.*]] = mul <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
+; CHECK: [[TMP45]] = mul <4 x i32> [[TMP44]], [[TMP43:%.*]]
+; CHECK: middle.block:
+; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
+; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP47]])
+;
+entry:
+  br label %.lr.ph
+
+.lr.ph: ; preds = %entry, %.lr.ph
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %entry ]
+  %prod.02 = phi i32 [ %l9, %.lr.ph ], [ 1, %entry ]
+  %l2 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l3 = load i32, i32* %l2, align 4
+  %l4 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
+  %l5 = load i32, i32* %l4, align 4
+  %l8 = mul i32 %prod.02, %l3
+  %l9 = mul i32 %l8, %l5
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph
+  %prod.0.lcssa = phi i32 [ %l9, %.lr.ph ]
+  ret i32 %prod.0.lcssa
+}
+
+define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_and(
+; CHECK: vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
+; CHECK: [[TMP44:%.*]] = and <4 x i32> [[VEC_PHI]], [[TMP42:%.*]]
+; CHECK: [[TMP45]] = and <4 x i32> [[TMP44]], [[TMP43]]
+; CHECK: middle.block:
+; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
+; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP47]])
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv
+  %l1 = load i32, i32* %arrayidx2, align 4
+  %add = and i32 %result.08, %l0
+  %and = and i32 %add, %l1
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ %and, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_or(
+; CHECK: vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
+; CHECK: [[TMP45]] = or <4 x i32> [[TMP44:%.*]], [[VEC_PHI]]
+; CHECK: middle.block:
+; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
+; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP47]])
+;
+entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv + %l1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %l1, %l0 + %or = or i32 %add, %result.08 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %or, %for.body ] + ret i32 %result.0.lcssa +} + +define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) { +; CHECK-LABEL: @reduction_xor( +; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ] +; CHECK: [[TMP45]] = xor <4 x i32> [[TMP44:%.*]], [[VEC_PHI]] +; CHECK: middle.block: +; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]] +; CHECK: [[TMP48:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP47]]) +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv + %l0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %indvars.iv + %l1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %l1, %l0 + %xor = xor i32 %add, %result.08 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %xor, %for.body ] + ret i32 %result.0.lcssa +} + +define float @reduction_fadd(float* nocapture %A, float* nocapture %B) { +; CHECK-LABEL: @reduction_fadd( +; CHECK: vector.body: +; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ] +; CHECK: [[TMP44:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]] +; CHECK: [[TMP45]] = fadd fast <4 x float> [[TMP44]], [[TMP43]] +; CHECK: middle.block: +; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]] +; CHECK: [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP47]]) +; +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi float [ %fadd, %for.body ], [ 0.0, %entry ] + %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv + %l0 = load float, float* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv + %l1 = load float, float* %arrayidx2, align 4 + %add = fadd fast float %result.08, %l0 + %fadd = fadd fast float %add, %l1 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi float [ %fadd, %for.body ] + ret float %result.0.lcssa +} + +define float @reduction_fmul(float* nocapture %A, float* nocapture %B) { +; CHECK-LABEL: @reduction_fmul( +; CHECK: 
vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ [[TMP45:%.*]], %pred.load.continue14 ]
+; CHECK: [[TMP44:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[TMP23:%.*]]
+; CHECK: [[TMP45]] = fmul fast <4 x float> [[TMP44]], [[TMP43]]
+; CHECK: middle.block:
+; CHECK: [[TMP47:%.*]] = select <4 x i1> [[TMP3:%.*]], <4 x float> [[TMP45]], <4 x float> [[VEC_PHI]]
+; CHECK: [[TMP48:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP47]])
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %A, i32 %indvars.iv
+  %l0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %B, i32 %indvars.iv
+  %l1 = load float, float* %arrayidx2, align 4
+  %add = fmul fast float %result.08, %l0
+  %fmul = fmul fast float %add, %l1
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi float [ %fmul, %for.body ]
+  ret float %result.0.lcssa
+}
+
+define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_min(
+; CHECK: vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, %vector.ph ], [ [[TMP25:%.*]], %pred.load.continue6 ]
+; CHECK: [[TMP24:%.*]] = icmp slt <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
+; CHECK: [[TMP25]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
+; CHECK: middle.block:
+; CHECK: [[TMP27:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
+; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> [[TMP27]])
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l0 = load i32, i32* %arrayidx, align 4
+  %c0 = icmp slt i32 %result.08, %l0
+  %v0 = select i1 %c0, i32 %result.08, i32 %l0
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 257
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+  %result.0.lcssa = phi i32 [ %v0, %for.body ]
+  ret i32 %result.0.lcssa
+}
+
+define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
+; CHECK-LABEL: @reduction_max(
+; CHECK: vector.body:
+; CHECK: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1000, i32 1000, i32 1000, i32 1000>, %vector.ph ], [ [[TMP25:%.*]], %pred.load.continue6 ]
+; CHECK: [[TMP24:%.*]] = icmp ugt <4 x i32> [[VEC_PHI]], [[TMP23:%.*]]
+; CHECK: [[TMP25]] = select <4 x i1> [[TMP24]], <4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]
+; CHECK: middle.block:
+; CHECK: [[TMP27:%.*]] = select <4 x i1> [[TMP0:%.*]], <4 x i32> [[TMP25]], <4 x i32> [[VEC_PHI]]
+; CHECK: [[TMP28:%.*]] = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> [[TMP27]])
+;
+entry:
+  br label %for.body
+
+for.body: ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %result.08 = phi i32 [ %v0, %for.body ], [ 1000, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
+  %l0 = load i32, i32* %arrayidx, align 4
+  %c0 = icmp ugt i32 %result.08, %l0
+  %v0 = select i1 %c0, 
i32 %result.08, i32 %l0 + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 257 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ %v0, %for.body ] + ret i32 %result.0.lcssa +} From 87122c3480e2115951045102bb26eedc200c8473 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 18 Aug 2020 16:08:15 +0100 Subject: [PATCH 048/101] [X86] Regenerate load-slice test labels. NFCI. Pulled out a superfluous diff from D66004 --- llvm/test/CodeGen/X86/load-slice.ll | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/load-slice.ll b/llvm/test/CodeGen/X86/load-slice.ll index 3cbb70bd70d788..3bf95778f5647b 100644 --- a/llvm/test/CodeGen/X86/load-slice.ll +++ b/llvm/test/CodeGen/X86/load-slice.ll @@ -16,7 +16,7 @@ ; Low slice starts at 0 (base) and is 8-bytes aligned. ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned. ; -; STRESS-LABEL: t1: +; STRESS-LABEL: _t1: ; Load out[out_start + 8].real, this is base + 8 * 8 + 0. ; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]] ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4. @@ -31,7 +31,7 @@ ; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]]) ; ; Same for REGULAR, we eliminate register bank copy with each slices. -; REGULAR-LABEL: t1: +; REGULAR-LABEL: _t1: ; Load out[out_start + 8].real, this is base + 8 * 8 + 0. ; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]] ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4. @@ -90,14 +90,14 @@ declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) ; Low slice starts at 0 (base) and is 8-bytes aligned. ; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned. ; -; STRESS-LABEL: t2: +; STRESS-LABEL: _t2: ; STRESS: movzwl 6([[BASE:[^)]+]]), %eax ; STRESS-NEXT: addl ([[BASE]]), %eax ; STRESS-NEXT: ret ; ; For the REGULAR heuristic, this is not profitable to slice things that are not ; next to each other in memory. Here we have a hole with bytes #4-5. -; REGULAR-LABEL: t2: +; REGULAR-LABEL: _t2: ; REGULAR: shrq $48 define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) { %arrayidx = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %out_start @@ -117,11 +117,11 @@ define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) { ; Second slice uses bytes numbered 6 and 7. ; Third slice uses bytes numbered 4 to 7. ; -; STRESS-LABEL: t3: +; STRESS-LABEL: _t3: ; STRESS: shrq $48 ; STRESS: shrq $32 ; -; REGULAR-LABEL: t3: +; REGULAR-LABEL: _t3: ; REGULAR: shrq $48 ; REGULAR: shrq $32 define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) { From a65a50540e3b5dd1938a1d14f31b912a311537fb Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 18 Aug 2020 08:16:25 -0700 Subject: [PATCH 049/101] [mlir][Linalg] Canonicalize tensor_reshape(splat-constant) -> splat-constant. When the operand to the linalg.tensor_reshape op is a splat constant, the result can be replaced with a splat constant of the same value but different type. 
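For example, with this change a reshape of a splat constant such as the
following (this is the pattern exercised by the tests added below):

```
%cst = constant dense<42.0> : tensor<2x8xf32>
%0 = linalg.tensor_reshape %cst
       [affine_map<(d0, d1, d2) -> (d0)>,
        affine_map<(d0, d1, d2) -> (d1, d2)>]
     : tensor<2x8xf32> into tensor<2x4x2xf32>
```

folds into a single splat constant of the result type:

```
%0 = constant dense<42.0> : tensor<2x4x2xf32>
```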
Differential Revision: https://reviews.llvm.org/D86117
---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp   | 22 ++++++++-
 mlir/test/Dialect/Linalg/canonicalize.mlir | 57 ++++++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 009699be526321..308272d66d567a 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -18,6 +18,7 @@
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/Function.h"
+#include "mlir/IR/Matchers.h"
 #include "mlir/IR/Module.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/IR/PatternMatch.h"
@@ -734,9 +735,28 @@ static LogicalResult verify(TensorReshapeOp op) {
   return success();
 }
 
+/// Reshape of a splat constant can be replaced with a constant of the result
+/// type.
+struct FoldReshapeWithConstant : OpRewritePattern<TensorReshapeOp> {
+  using OpRewritePattern<TensorReshapeOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(TensorReshapeOp reshapeOp,
+                                PatternRewriter &rewriter) const override {
+    DenseElementsAttr attr;
+    if (!matchPattern(reshapeOp.src(), m_Constant(&attr)))
+      return failure();
+    if (!attr || !attr.isSplat())
+      return failure();
+    DenseElementsAttr newAttr = DenseElementsAttr::getFromRawBuffer(
+        reshapeOp.getResultType(), attr.getRawData(), true);
+    rewriter.replaceOpWithNewOp<ConstantOp>(reshapeOp, newAttr);
+    return success();
+  }
+};
+
 void TensorReshapeOp::getCanonicalizationPatterns(
     OwningRewritePatternList &results, MLIRContext *context) {
-  results.insert<CollapseReshapeOps<TensorReshapeOp>>(context);
+  results.insert<CollapseReshapeOps<TensorReshapeOp>, FoldReshapeWithConstant>(
+      context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index 005bd1c874458e..85321084cd0c68 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -203,3 +203,60 @@ func @dce_zero_memref(%arg0 : memref<0xf32>, %arg1: tensor<0xf32>) -> tensor<0xf
 // CHECK-NOT: linalg.copy
 // CHECK-NEXT: linalg.generic
 
+// -----
+
+func @reshape_splat_constant_int32() -> tensor<2x4x2xi32>
+{
+  %c0 = constant dense<42> : tensor<2x8xi32>
+  %0 = linalg.tensor_reshape %c0
+         [affine_map<(d0, d1, d2) -> (d0)>,
+          affine_map<(d0, d1, d2) -> (d1, d2)>]
+       : tensor<2x8xi32> into tensor<2x4x2xi32>
+  return %0 : tensor<2x4x2xi32>
+}
+// CHECK-LABEL: @reshape_splat_constant_int32
+// CHECK: %[[CST:.*]] = constant dense<{{.*}}> : tensor<2x4x2xi32>
+// CHECK-NOT: linalg.tensor_reshape
+// CHECK: return %[[CST]]
+
+func @reshape_splat_constant_int16() -> tensor<2x4x2xi16>
+{
+  %c0 = constant dense<42> : tensor<2x8xi16>
+  %0 = linalg.tensor_reshape %c0
+         [affine_map<(d0, d1, d2) -> (d0)>,
+          affine_map<(d0, d1, d2) -> (d1, d2)>]
+       : tensor<2x8xi16> into tensor<2x4x2xi16>
+  return %0 : tensor<2x4x2xi16>
+}
+// CHECK-LABEL: @reshape_splat_constant_int16
+// CHECK: %[[CST:.*]] = constant dense<{{.*}}> : tensor<2x4x2xi16>
+// CHECK-NOT: linalg.tensor_reshape
+// CHECK: return %[[CST]]
+
+func @reshape_splat_constant_float32() -> tensor<2x4x2xf32>
+{
+  %c0 = constant dense<42.0> : tensor<2x8xf32>
+  %0 = linalg.tensor_reshape %c0
+         [affine_map<(d0, d1, d2) -> (d0)>,
+          affine_map<(d0, d1, d2) -> (d1, d2)>]
+       : tensor<2x8xf32> into tensor<2x4x2xf32>
+  return %0 : tensor<2x4x2xf32>
+}
+// CHECK-LABEL: @reshape_splat_constant_float32
+// CHECK: %[[CST:.*]] = constant dense<{{.*}}> : tensor<2x4x2xf32>
+// CHECK-NOT: linalg.tensor_reshape
+// CHECK: return %[[CST]]
+
+func @reshape_splat_constant_float64() -> tensor<2x4x2xf64>
+{
+  %c0 = constant dense<42.0> : tensor<2x8xf64>
+  %0 = linalg.tensor_reshape %c0
+         [affine_map<(d0, d1, d2) -> (d0)>,
+          affine_map<(d0, d1, d2) -> (d1, d2)>]
+       : tensor<2x8xf64> into tensor<2x4x2xf64>
+  return %0 : tensor<2x4x2xf64>
+}
+// CHECK-LABEL: @reshape_splat_constant_float64
+// CHECK: %[[CST:.*]] = constant dense<{{.*}}> : tensor<2x4x2xf64>
+// CHECK-NOT: linalg.tensor_reshape
+// CHECK: return %[[CST]]

From f48eced390dcda54766e1c510af10bbcbaebcd7e Mon Sep 17 00:00:00 2001
From: jasonliu
Date: Tue, 18 Aug 2020 14:18:53 +0000
Subject: [PATCH 050/101] [XCOFF] emit .rename for .lcomm when necessary

Summary:
This is a follow-up to D82481. For the .lcomm directive, although it is
not necessary to emit .rename, it is still desirable to do so, so that
the internal 'Rename..' name does not show up in the symbol table, the
naming is consistent between the TC entry and .lcomm, and the naming is
consistent between the IR and the final object file.

Reviewed By: hubert.reinterpretcast

Differential Revision: https://reviews.llvm.org/D86075
---
 llvm/lib/MC/MCAsmStreamer.cpp                 |  6 ++
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |  2 +-
 .../PowerPC/aix-xcoff-symbol-rename.ll        | 90 ++++++++++++-------
 3 files changed, 63 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index db0ed9a73d226f..490557a2db0878 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -797,6 +797,12 @@ void MCAsmStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
     OS << ',' << Log2_32(ByteAlignment);
 
   EmitEOL();
+
+  // Print symbol's rename (original name contains invalid character(s)) if
+  // there is one.
+  MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(CsectSym);
+  if (XSym->hasRename())
+    emitXCOFFRenameDirective(XSym, XSym->getSymbolTableName());
 }
 
 void MCAsmStreamer::emitXCOFFSymbolLinkageWithVisibility(
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 920d6ae1f0d6cf..c7510ec05b2406 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1740,7 +1740,7 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
 
   if (GVKind.isBSSLocal())
     OutStreamer->emitXCOFFLocalCommonSymbol(
-        OutContext.getOrCreateSymbol(GVSym->getUnqualifiedName()), Size,
+        OutContext.getOrCreateSymbol(GVSym->getSymbolTableName()), Size,
         GVSym, Alignment.value());
   else
     OutStreamer->emitCommonSymbol(GVSym, Size, Alignment.value());
diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-symbol-rename.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-symbol-rename.ll
index 72502f925d29de..f486fc9524c3d3 100644
--- a/llvm/test/CodeGen/PowerPC/aix-xcoff-symbol-rename.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-symbol-rename.ll
@@ -18,6 +18,9 @@
 ; This is f"o"
 @"f\22o\22" = common global i32 0, align 4
 
+; This is f=o
+@"f\3do" = internal global i32 0, align 4
+
 define internal i32 @f$o() {
 entry:
   %call = call i32 bitcast (i32 (...)* @"f\40o" to i32 ()*)()
@@ -27,8 +30,10 @@ entry:
 ; This is f&o
 define i32 @"f\26o"() {
 entry:
-  %call = call i32 @f$o()
-  ret i32 %call
+  %tmp = call i32 @f$o()
+  %tmp1 = load i32, i32* @"f\3do"
+  %tmp2 = add i32 %tmp, %tmp1
+  ret i32 %tmp2
 }
 
 ; This is f&_o
@@ -84,12 +89,17 @@ declare i32 @"f\40o"(...)
; ASM-NEXT: .vbyte 4, 10 # 0xa ; ASM-NEXT: .comm _Renamed..2222f_o_[RW],4,2 ; ASM-NEXT: .rename _Renamed..2222f_o_[RW],"f""o""" +; ASM-NEXT: .lcomm _Renamed..3df_o,4,_Renamed..3df_o[BS],2 +; ASM-NEXT: .rename _Renamed..3df_o[BS],"f=o" ; ASM-NEXT: .extern ._Renamed..40f_o[PR] ; ASM-NEXT: .rename ._Renamed..40f_o[PR],".f@o" ; ASM-NEXT: .extern _Renamed..40f_o[DS] ; ASM-NEXT: .rename _Renamed..40f_o[DS],"f@o" ; ASM-NEXT: .toc ; ASM-NEXT: L..C0: +; ASM-NEXT: .tc _Renamed..3df_o[TC],_Renamed..3df_o[BS] +; ASM-NEXT: .rename _Renamed..3df_o[TC],"f=o" +; ASM-NEXT: L..C1: ; ASM-NEXT: .tc _Renamed..40f_o[TC],_Renamed..40f_o[DS] ; ASM-NEXT: .rename _Renamed..40f_o[TC],"f@o" @@ -115,47 +125,59 @@ declare i32 @"f\40o"(...) ; OBJ-NEXT: 34: 90 01 00 08 stw 0, 8(1) ; OBJ-NEXT: 38: 94 21 ff c0 stwu 1, -64(1) ; OBJ-NEXT: 3c: 4b ff ff c5 bl 0x0 -; OBJ-NEXT: 40: 38 21 00 40 addi 1, 1, 64 -; OBJ-NEXT: 44: 80 01 00 08 lwz 0, 8(1) -; OBJ-NEXT: 48: 7c 08 03 a6 mtlr 0 -; OBJ-NEXT: 4c: 4e 80 00 20 blr +; OBJ-NEXT: 40: 80 82 00 00 lwz 4, 0(2) +; OBJ-NEXT: 00000042: R_TOC (idx: 24) f=o[TC] +; OBJ-NEXT: 44: 80 84 00 00 lwz 4, 0(4) +; OBJ-NEXT: 48: 7c 63 22 14 add 3, 3, 4 +; OBJ-NEXT: 4c: 38 21 00 40 addi 1, 1, 64 +; OBJ-NEXT: 50: 80 01 00 08 lwz 0, 8(1) +; OBJ-NEXT: 54: 7c 08 03 a6 mtlr 0 +; OBJ-NEXT: 58: 4e 80 00 20 blr +; OBJ-NEXT: 5c: 60 00 00 00 nop ; OBJ-EMPTY: -; OBJ-NEXT: 00000050 (idx: 10) .f&_o: -; OBJ-NEXT: 50: 80 62 00 00 lwz 3, 0(2) -; OBJ-NEXT: 00000052: R_TOC (idx: 24) f@o[TC] -; OBJ-NEXT: 54: 4e 80 00 20 blr +; OBJ-NEXT: 00000060 (idx: 10) .f&_o: +; OBJ-NEXT: 60: 80 62 00 04 lwz 3, 4(2) +; OBJ-NEXT: 00000062: R_TOC (idx: 26) f@o[TC] +; OBJ-NEXT: 64: 4e 80 00 20 blr ; OBJ-EMPTY: ; OBJ-NEXT: Disassembly of section .data: ; OBJ-EMPTY: -; OBJ-NEXT: 00000058 (idx: 14) f`o: -; OBJ-NEXT: 58: 00 00 00 0a +; OBJ-NEXT: 00000068 (idx: 14) f`o: +; OBJ-NEXT: 68: 00 00 00 0a ; OBJ-EMPTY: -; OBJ-NEXT: 0000005c (idx: 16) f$o[DS]: -; OBJ-NEXT: 5c: 00 00 00 00 -; OBJ-NEXT: 0000005c: R_POS (idx: 6) .f$o -; OBJ-NEXT: 60: 00 00 00 80 -; OBJ-NEXT: 00000060: R_POS (idx: 22) TOC[TC0] -; OBJ-NEXT: 64: 00 00 00 00 +; OBJ-NEXT: 0000006c (idx: 16) f$o[DS]: +; OBJ-NEXT: 6c: 00 00 00 00 +; OBJ-NEXT: 0000006c: R_POS (idx: 6) .f$o +; OBJ-NEXT: 70: 00 00 00 90 +; OBJ-NEXT: 00000070: R_POS (idx: 22) TOC[TC0] +; OBJ-NEXT: 74: 00 00 00 00 ; OBJ-EMPTY: -; OBJ-NEXT: 00000068 (idx: 18) f&o[DS]: -; OBJ-NEXT: 68: 00 00 00 30 -; OBJ-NEXT: 00000068: R_POS (idx: 8) .f&o -; OBJ-NEXT: 6c: 00 00 00 80 -; OBJ-NEXT: 0000006c: R_POS (idx: 22) TOC[TC0] -; OBJ-NEXT: 70: 00 00 00 00 +; OBJ-NEXT: 00000078 (idx: 18) f&o[DS]: +; OBJ-NEXT: 78: 00 00 00 30 +; OBJ-NEXT: 00000078: R_POS (idx: 8) .f&o +; OBJ-NEXT: 7c: 00 00 00 90 +; OBJ-NEXT: 0000007c: R_POS (idx: 22) TOC[TC0] +; OBJ-NEXT: 80: 00 00 00 00 ; OBJ-EMPTY: -; OBJ-NEXT: 00000074 (idx: 20) f&_o[DS]: -; OBJ-NEXT: 74: 00 00 00 50 -; OBJ-NEXT: 00000074: R_POS (idx: 10) .f&_o -; OBJ-NEXT: 78: 00 00 00 80 -; OBJ-NEXT: 00000078: R_POS (idx: 22) TOC[TC0] -; OBJ-NEXT: 7c: 00 00 00 00 +; OBJ-NEXT: 00000084 (idx: 20) f&_o[DS]: +; OBJ-NEXT: 84: 00 00 00 60 +; OBJ-NEXT: 00000084: R_POS (idx: 10) .f&_o +; OBJ-NEXT: 88: 00 00 00 90 +; OBJ-NEXT: 00000088: R_POS (idx: 22) TOC[TC0] +; OBJ-NEXT: 8c: 00 00 00 00 ; OBJ-EMPTY: -; OBJ-NEXT: 00000080 (idx: 24) f@o[TC]: -; OBJ-NEXT: 80: 00 00 00 00 -; OBJ-NEXT: 00000080: R_POS (idx: 2) f@o[DS] +; OBJ-NEXT: 00000090 (idx: 24) f=o[TC]: +; OBJ-NEXT: 90: 00 00 00 9c +; OBJ-NEXT: 00000090: R_POS (idx: 30) f=o[BS] +; OBJ-EMPTY: +; OBJ-NEXT: 00000094 (idx: 26) f@o[TC]: +; OBJ-NEXT: 
94: 00 00 00 00 +; OBJ-NEXT: 00000094: R_POS (idx: 2) f@o[DS] ; OBJ-EMPTY: ; OBJ-NEXT: Disassembly of section .bss: ; OBJ-EMPTY: -; OBJ-NEXT: 00000084 (idx: 26) f"o"[RW]: +; OBJ-NEXT: 00000098 (idx: 28) f"o"[RW]: +; OBJ-NEXT: ... +; OBJ-EMPTY: +; OBJ-NEXT: 0000009c (idx: 30) f=o[BS]: ; OBJ-NEXT: ... From 224a8c639eeb36b7a5ac6f8a50295f9ee2cb2518 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Mon, 17 Aug 2020 16:42:28 -0700 Subject: [PATCH 051/101] [GlobalISel][CallLowering] Look through call parameters for flags We weren't looking through the parameters on calls at all. E.g., say you had ``` declare i32 @zext(i32 zeroext %x) ... %y = call i32 @zext(i32 %something) ... ``` At the point of the call, we wouldn't know that the %something should have the zeroext attribute. This sets flags in about the same way as TargetLoweringBase::ArgListEntry::setAttributes. Differential Revision: https://reviews.llvm.org/D86125 --- .../llvm/CodeGen/GlobalISel/CallLowering.h | 5 ++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 30 ++++++++- .../AArch64/GlobalISel/call-translator.ll | 36 +++++++++++ .../CodeGen/AArch64/GlobalISel/swifterror.ll | 64 +++++++++++++++++++ .../CodeGen/AArch64/GlobalISel/swiftself.ll | 11 ++++ .../GlobalISel/irtranslator-call-sret.ll | 17 +++-- 6 files changed, 155 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index 38afed764f2939..1eec08f5106220 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -208,6 +208,11 @@ class CallLowering { return static_cast(TLI); } + /// \returns Flags corresponding to the attributes on the \p ArgIdx-th + /// parameter of \p Call. 
+ ISD::ArgFlagsTy getAttributesForArgIdx(const CallBase &Call, + unsigned ArgIdx) const; + template void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 661a8560a1c998..e443f603def6b3 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -30,6 +30,34 @@ using namespace llvm; void CallLowering::anchor() {} +ISD::ArgFlagsTy CallLowering::getAttributesForArgIdx(const CallBase &Call, + unsigned ArgIdx) const { + ISD::ArgFlagsTy Flags; + if (Call.paramHasAttr(ArgIdx, Attribute::SExt)) + Flags.setSExt(); + if (Call.paramHasAttr(ArgIdx, Attribute::ZExt)) + Flags.setZExt(); + if (Call.paramHasAttr(ArgIdx, Attribute::InReg)) + Flags.setInReg(); + if (Call.paramHasAttr(ArgIdx, Attribute::StructRet)) + Flags.setSRet(); + if (Call.paramHasAttr(ArgIdx, Attribute::Nest)) + Flags.setNest(); + if (Call.paramHasAttr(ArgIdx, Attribute::ByVal)) + Flags.setByVal(); + if (Call.paramHasAttr(ArgIdx, Attribute::Preallocated)) + Flags.setPreallocated(); + if (Call.paramHasAttr(ArgIdx, Attribute::InAlloca)) + Flags.setInAlloca(); + if (Call.paramHasAttr(ArgIdx, Attribute::Returned)) + Flags.setReturned(); + if (Call.paramHasAttr(ArgIdx, Attribute::SwiftSelf)) + Flags.setSwiftSelf(); + if (Call.paramHasAttr(ArgIdx, Attribute::SwiftError)) + Flags.setSwiftError(); + return Flags; +} + bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, ArrayRef ResRegs, ArrayRef> ArgRegs, @@ -44,7 +72,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, unsigned i = 0; unsigned NumFixedArgs = CB.getFunctionType()->getNumParams(); for (auto &Arg : CB.args()) { - ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{}, + ArgInfo OrigArg{ArgRegs[i], Arg->getType(), getAttributesForArgIdx(CB, i), i < NumFixedArgs}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB); Info.OrigArgs.push_back(OrigArg); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll index ad38b2bb8b9c23..7eb21c21b86cc5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -151,6 +151,42 @@ define void @test_abi_exts_call(i8* %addr) { ret void } +; CHECK-LABEL: name: test_zext_in_callee +; CHECK: bb.1 (%ir-block.0): +; CHECK: liveins: $x0 +; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 +; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1 from %ir.addr) +; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp +; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8) +; CHECK: $w0 = COPY [[ZEXT]](s32) +; CHECK: BL @has_zext_param, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $w0 +; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp +; CHECK: RET_ReallyLR +declare void @has_zext_param(i8 zeroext) +define void @test_zext_in_callee(i8* %addr) { + %val = load i8, i8* %addr + call void @has_zext_param(i8 %val) + ret void +} + +; CHECK-LABEL: name: test_sext_in_callee +; CHECK: bb.1 (%ir-block.0): +; CHECK: liveins: $x0 +; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 +; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[COPY]](p0) :: (load 1 from %ir.addr) +; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp +; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8) +; CHECK: $w0 = COPY [[SEXT]](s32) +; 
CHECK: BL @has_sext_param, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $w0 +; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp +; CHECK: RET_ReallyLR +declare void @has_sext_param(i8 signext) +define void @test_sext_in_callee(i8* %addr) { + %val = load i8, i8* %addr + call void @has_sext_param(i8 %val) + ret void +} + ; CHECK-LABEL: name: test_abi_sext_ret ; CHECK: [[VAL:%[0-9]+]]:_(s8) = G_LOAD ; CHECK: [[SVAL:%[0-9]+]]:_(s32) = G_SEXT [[VAL]](s8) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll index 4a3e5b04681476..a4a1747b05af9b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/swifterror.ll @@ -513,3 +513,67 @@ a: %error = load %swift_error*, %swift_error** %error_ptr ret %swift_error* %error } + +; foo takes a swifterror parameter. We should be able to see that even when +; it isn't explicitly on the call. +define float @swifterror_param_not_on_call(i8* %error_ref) { +; CHECK-LABEL: swifterror_param_not_on_call: +; CHECK: mov [[ID:x[0-9]+]], x0 +; CHECK: bl {{.*}}foo +; CHECK: mov x0, x21 +; CHECK: cbnz x21 +; Access part of the error object and save it to error_ref +; CHECK: ldrb [[CODE:w[0-9]+]], [x0, #8] +; CHECK: strb [[CODE]], [{{.*}}[[ID]]] +; CHECK: bl {{.*}}free + +entry: + %error_ptr_ref = alloca swifterror %swift_error* + store %swift_error* null, %swift_error** %error_ptr_ref + %call = call float @foo(%swift_error** %error_ptr_ref) + %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref + %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null + %tmp = bitcast %swift_error* %error_from_foo to i8* + br i1 %had_error_from_foo, label %handler, label %cont +cont: + %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1 + %t = load i8, i8* %v1 + store i8 %t, i8* %error_ref + br label %handler +handler: + call void @free(i8* %tmp) + ret float 1.0 +} + +; foo_sret takes an sret parameter and a swifterror parameter. We should be +; able to see that, even if it's not explicitly on the call. 
+define float @swifterror_param_not_on_call2(i8* %error_ref) { +; CHECK-LABEL: swifterror_param_not_on_call2: +; CHECK: mov [[ID:x[0-9]+]], x0 +; CHECK: mov [[ZERO:x[0-9]+]], xzr +; CHECK: bl {{.*}}foo_sret +; CHECK: mov x0, x21 +; CHECK: cbnz x21 +; Access part of the error object and save it to error_ref +; CHECK: ldrb [[CODE:w[0-9]+]], [x0, #8] +; CHECK: strb [[CODE]], [{{.*}}[[ID]]] +; CHECK: bl {{.*}}free + +entry: + %s = alloca %struct.S, align 8 + %error_ptr_ref = alloca swifterror %swift_error* + store %swift_error* null, %swift_error** %error_ptr_ref + call void @foo_sret(%struct.S* %s, i32 1, %swift_error** %error_ptr_ref) + %error_from_foo = load %swift_error*, %swift_error** %error_ptr_ref + %had_error_from_foo = icmp ne %swift_error* %error_from_foo, null + %tmp = bitcast %swift_error* %error_from_foo to i8* + br i1 %had_error_from_foo, label %handler, label %cont +cont: + %v1 = getelementptr inbounds %swift_error, %swift_error* %error_from_foo, i64 0, i32 1 + %t = load i8, i8* %v1 + store i8 %t, i8* %error_ref + br label %handler +handler: + call void @free(i8* %tmp) + ret float 1.0 +} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll b/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll index 8ed06f23383c4f..0f090d488cf109 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/swiftself.ll @@ -60,3 +60,14 @@ entry: store i8* %3, i8** %0, align 8 ret void } + +; Check that x20 is used to pass a swiftself argument when the parameter is +; only in the declaration's arguments. +; CHECK-LABEL: _swiftself_not_on_call_params: +; CHECK: mov x20, x0 +; CHECK: bl {{_?}}swiftself_param +; CHECK: ret +define i8 *@swiftself_not_on_call_params(i8* %arg) { + %res = call i8 *@swiftself_param(i8* %arg) + ret i8 *%res +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll index d53cfe688f53cf..f244a840476daf 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -49,9 +49,12 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C5]](s32) ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] ; GCN: $vgpr0 = COPY [[FRAME_INDEX1]](p5) - ; GCN: $vgpr1 = COPY [[FRAME_INDEX]](p5) - ; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; GCN: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg + ; GCN: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GCN: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C6]](s32) + ; GCN: G_STORE [[FRAME_INDEX]](p5), [[PTR_ADD2]](p5) :: (store 4 into stack, align 16, addrspace 5) + ; GCN: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>) ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4) ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) @@ -60,11 +63,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GCN: $sgpr13 = COPY [[COPY16]](s32) ; GCN: $sgpr14 = COPY [[COPY17]](s32) ; GCN: $vgpr31 = COPY [[OR1]](s32) - ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit 
$sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
-  ; GCN:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
-  ; GCN:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32)
+  ; GCN:   $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
+  ; GCN:   ADJCALLSTACKDOWN 0, 8, implicit-def $scc
+  ; GCN:   [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32)
   ; GCN:   [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load 1 from %ir.out.gep02, addrspace 5)
-  ; GCN:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (dereferenceable load 4 from %ir.out.gep1, addrspace 5)
+  ; GCN:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (dereferenceable load 4 from %ir.out.gep1, addrspace 5)
   ; GCN:   G_STORE [[LOAD]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1)
   ; GCN:   G_STORE [[LOAD1]](s32), [[COPY10]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
   ; GCN:   S_ENDPGM 0

From ec29538af2e0886a65f479d6a533956a1c478132 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Thu, 13 Aug 2020 09:00:26 -0700
Subject: [PATCH 052/101] [ELF] Assign file offsets of non-SHF_ALLOC after
 SHF_ALLOC and set sh_addr=0 to non-SHF_ALLOC

* GNU ld places non-SHF_ALLOC sections after SHF_ALLOC sections. This has the
  advantage that the file offsets of a non-SHF_ALLOC section can never be
  contained in a PT_LOAD segment. This patch matches that behavior.
* For non-SHF_ALLOC non-orphan sections, GNU ld may assign a non-zero sh_addr
  and treat them similarly to SHT_NOBITS (i.e. not advance the location
  counter). This is an alternative approach to what we have done in D85100.
  By placing non-SHF_ALLOC sections at the end, we can drop the special cases
  in createSection and findOrphanPos added by D85100.

Different from GNU ld, we set sh_addr to 0 for non-SHF_ALLOC sections. 0 is
arguably better because non-SHF_ALLOC sections don't appear in the memory
image. The ELF spec says:

> sh_addr - If the section will appear in the memory image of a process, this
> member gives the address at which the section's first byte should
> reside. Otherwise, the member contains 0.

D85100 appeared to take a detour. Taking a combined view of D85100 and this
patch, the overall complexity increases only slightly (one more 3-line loop)
while compatibility with GNU ld improves.

The behavior we don't want to match is the special treatment of .symtab
.shstrtab .strtab: they can be matched in LLD's linker scripts but not in
GNU ld's.
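To make the resulting layout concrete, here is a minimal sketch of the
two-pass file offset assignment this patch adopts (condensed from the
Writer.cpp hunk below; `outputSections`, `SHF_ALLOC` and `setFileOffset` are
the real LLD names, while the page-alignment handling of the last executable
PT_LOAD is omitted):

```cpp
// Pass 1: SHF_ALLOC sections receive contiguous file offsets, so any of
// them can be covered by a PT_LOAD segment.
for (OutputSection *sec : outputSections)
  if (sec->flags & SHF_ALLOC)
    off = setFileOffset(sec, off);
// Pass 2: non-SHF_ALLOC sections are appended afterwards; their offsets
// can never fall inside a PT_LOAD, and their sh_addr stays 0.
for (OutputSection *sec : outputSections)
  if (!(sec->flags & SHF_ALLOC))
    off = setFileOffset(sec, off);
```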
Reviewed By: jhenderson, psmith Differential Revision: https://reviews.llvm.org/D85867 --- lld/ELF/LinkerScript.cpp | 37 +++++--- lld/ELF/Writer.cpp | 13 +-- .../linkerscript/memory-region-alignment.test | 21 ++--- lld/test/ELF/linkerscript/sections-nonalloc.s | 90 +++++++++++++++++++ lld/test/ELF/linkerscript/sections.s | 39 +------- .../ELF/linkerscript/symbols-non-alloc.test | 7 +- 6 files changed, 138 insertions(+), 69 deletions(-) create mode 100644 lld/test/ELF/linkerscript/sections-nonalloc.s diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index a187aa1eb05a20..7e97576923c975 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -586,8 +586,6 @@ static OutputSection *findByName(ArrayRef vec, static OutputSection *createSection(InputSectionBase *isec, StringRef outsecName) { OutputSection *sec = script->createOutputSection(outsecName, ""); - if (!(isec->flags & SHF_ALLOC)) - sec->addrExpr = [] { return 0; }; sec->recordSection(isec); return sec; } @@ -852,21 +850,27 @@ static OutputSection *findFirstSection(PhdrEntry *load) { void LinkerScript::assignOffsets(OutputSection *sec) { const bool sameMemRegion = ctx->memRegion == sec->memRegion; const bool prevLMARegionIsDefault = ctx->lmaRegion == nullptr; + const uint64_t savedDot = dot; ctx->memRegion = sec->memRegion; ctx->lmaRegion = sec->lmaRegion; - if (ctx->memRegion) - dot = ctx->memRegion->curPos; - - if (sec->addrExpr) - setDot(sec->addrExpr, sec->location, false); - // If the address of the section has been moved forward by an explicit - // expression so that it now starts past the current curPos of the enclosing - // region, we need to expand the current region to account for the space - // between the previous section, if any, and the start of this section. - if (ctx->memRegion && ctx->memRegion->curPos < dot) - expandMemoryRegion(ctx->memRegion, dot - ctx->memRegion->curPos, - ctx->memRegion->name, sec->name); + if (sec->flags & SHF_ALLOC) { + if (ctx->memRegion) + dot = ctx->memRegion->curPos; + if (sec->addrExpr) + setDot(sec->addrExpr, sec->location, false); + + // If the address of the section has been moved forward by an explicit + // expression so that it now starts past the current curPos of the enclosing + // region, we need to expand the current region to account for the space + // between the previous section, if any, and the start of this section. + if (ctx->memRegion && ctx->memRegion->curPos < dot) + expandMemoryRegion(ctx->memRegion, dot - ctx->memRegion->curPos, + ctx->memRegion->name, sec->name); + } else { + // Non-SHF_ALLOC sections have zero addresses. + dot = 0; + } switchTo(sec); @@ -918,6 +922,11 @@ void LinkerScript::assignOffsets(OutputSection *sec) { for (InputSection *sec : cast(base)->sections) output(sec); } + + // Non-SHF_ALLOC sections do not affect the addresses of other OutputSections + // as they are not part of the process image. + if (!(sec->flags & SHF_ALLOC)) + dot = savedDot; } static bool isDiscardable(OutputSection &sec) { diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index cffdce0d6c310a..b26817b66e2711 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1234,13 +1234,7 @@ static bool shouldSkip(BaseCommand *cmd) { static std::vector::iterator findOrphanPos(std::vector::iterator b, std::vector::iterator e) { - // OutputSections without the SHF_ALLOC flag are not part of the memory image - // and their addresses usually don't matter. 
Place any orphan sections without - // the SHF_ALLOC flag at the end so that these do not affect the address - // assignment of OutputSections with the SHF_ALLOC flag. OutputSection *sec = cast(*e); - if (!(sec->flags & SHF_ALLOC)) - return e; // Find the first element that has as close a rank as possible. auto i = std::max_element(b, e, [=](BaseCommand *a, BaseCommand *b) { @@ -2589,7 +2583,11 @@ template void Writer::assignFileOffsets() { if (p->p_type == PT_LOAD && (p->p_flags & PF_X)) lastRX = p; + // Layout SHF_ALLOC sections before non-SHF_ALLOC sections. A non-SHF_ALLOC + // will not occupy file offsets contained by a PT_LOAD. for (OutputSection *sec : outputSections) { + if (!(sec->flags & SHF_ALLOC)) + continue; off = setFileOffset(sec, off); // If this is a last section of the last executable segment and that @@ -2599,6 +2597,9 @@ template void Writer::assignFileOffsets() { lastRX->lastSec == sec) off = alignTo(off, config->commonPageSize); } + for (OutputSection *sec : outputSections) + if (!(sec->flags & SHF_ALLOC)) + off = setFileOffset(sec, off); sectionHeaderOff = alignTo(off, config->wordsize); fileSize = sectionHeaderOff + (outputSections.size() + 1) * sizeof(Elf_Shdr); diff --git a/lld/test/ELF/linkerscript/memory-region-alignment.test b/lld/test/ELF/linkerscript/memory-region-alignment.test index f0540a7f11a789..ea858299a7ebac 100644 --- a/lld/test/ELF/linkerscript/memory-region-alignment.test +++ b/lld/test/ELF/linkerscript/memory-region-alignment.test @@ -1,5 +1,5 @@ # REQUIRES: x86 -# RUN: echo '.section .foo,"a"; .quad 0; .section .zed,"M",@progbits,1; .byte 0' > %t.s +# RUN: echo '.section .foo,"a"; .quad 0; .section .zed,"aM",@progbits,1; .byte 0' > %t.s # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %t.s -o %t.o MEMORY { @@ -28,24 +28,25 @@ SECTIONS { # CHECK-NEXT: Offset: 0x1008 # CHECK-NEXT: Size: 8 -# CHECK: Name: .text +# CHECK: Name: .zed # CHECK-NEXT: Type: SHT_PROGBITS # CHECK-NEXT: Flags [ # CHECK-NEXT: SHF_ALLOC -# CHECK-NEXT: SHF_EXECINSTR +# CHECK-NEXT: SHF_MERGE # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x10 # CHECK-NEXT: Offset: 0x1010 -# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Size: 1 -# CHECK: Name: .zed +# CHECK: Name: .text # CHECK-NEXT: Type: SHT_PROGBITS # CHECK-NEXT: Flags [ -# CHECK-NEXT: SHF_MERGE +# CHECK-NEXT: SHF_ALLOC +# CHECK-NEXT: SHF_EXECINSTR # CHECK-NEXT: ] -# CHECK-NEXT: Address: 0x10 -# CHECK-NEXT: Offset: 0x1010 -# CHECK-NEXT: Size: 1 +# CHECK-NEXT: Address: 0x14 +# CHECK-NEXT: Offset: 0x1014 +# CHECK-NEXT: Size: 0 # CHECK: Name: .comment # CHECK-NEXT: Type: SHT_PROGBITS @@ -54,5 +55,5 @@ SECTIONS { # CHECK-NEXT: SHF_STRINGS # CHECK-NEXT: ] # CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 0x1011 +# CHECK-NEXT: Offset: 0x1014 # CHECK-NEXT: Size: 8 diff --git a/lld/test/ELF/linkerscript/sections-nonalloc.s b/lld/test/ELF/linkerscript/sections-nonalloc.s new file mode 100644 index 00000000000000..a0669f701d8c90 --- /dev/null +++ b/lld/test/ELF/linkerscript/sections-nonalloc.s @@ -0,0 +1,90 @@ +# REQUIRES: x86 +# RUN: split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 %t/main.s -o %t.o + +## Non-SHF_ALLOC sections are placed after all SHF_ALLOC sections. They will +## thus not be contained in a PT_LOAD segment. data2 has a PT_LOAD segment, +## even if it is preceded by a non-SHF_ALLOC section. Non-SHF_ALLOC orphan +## sections have zero addresses. +## NOTE: GNU ld assigns non-zero addresses to non-SHF_ALLOC non-orphan sections. 
+# RUN: ld.lld -T %t/a.lds %t.o -o %ta +# RUN: llvm-readelf -S -l %ta | FileCheck %s + +# CHECK: [Nr] Name Type Address Off Size ES Flg Lk +# CHECK-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 +# CHECK-NEXT: [ 1] .bss NOBITS 0000000000000000 001000 000001 00 WA 0 +# CHECK-NEXT: [ 2] data1 PROGBITS 0000000000000001 001001 000001 00 WA 0 +# CHECK-NEXT: [ 3] data3 PROGBITS 0000000000000002 001002 000001 00 WA 0 +# CHECK-NEXT: [ 4] other1 PROGBITS 0000000000000000 001008 000001 00 0 +# CHECK-NEXT: [ 5] other2 PROGBITS 0000000000000000 001010 000001 00 0 +## Orphan placement places other3, .symtab, .shstrtab and .strtab after other2. +# CHECK-NEXT: [ 6] other3 PROGBITS 0000000000000000 001020 000001 00 0 +# CHECK-NEXT: [ 7] .symtab SYMTAB 0000000000000000 001028 000030 18 9 +# CHECK-NEXT: [ 8] .shstrtab STRTAB 0000000000000000 001058 00004d 00 0 +# CHECK-NEXT: [ 9] .strtab STRTAB 0000000000000000 0010a5 000008 00 0 +# CHECK-NEXT: [10] data2 PROGBITS 0000000000000003 001003 000001 00 WA 0 +# CHECK-NEXT: [11] .text PROGBITS 0000000000000004 001004 000001 00 AX 0 + +# CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000004 0x000004 RW 0x1000 +# CHECK-NEXT: LOAD 0x001004 0x0000000000000004 0x0000000000000004 0x000001 0x000001 R E 0x1000 +# CHECK-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 + +# RUN: ld.lld -T %t/b.lds %t.o -o %tb +# RUN: llvm-readelf -S -l %tb | FileCheck %s --check-prefix=CHECK1 + +# CHECK1: [Nr] Name Type Address Off Size ES Flg Lk +# CHECK1-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 +# CHECK1-NEXT: [ 1] .text PROGBITS 00000000000000b0 0000b0 000001 00 AX 0 +# CHECK1-NEXT: [ 2] .bss NOBITS 00000000000000b1 0000b1 000001 00 WA 0 +# CHECK1-NEXT: [ 3] data1 PROGBITS 00000000000000b2 0000b2 000001 00 WA 0 +# CHECK1-NEXT: [ 4] data3 PROGBITS 00000000000000b3 0000b3 000001 00 WA 0 +# CHECK1-NEXT: [ 5] other1 PROGBITS 0000000000000000 0000b8 000001 00 0 +# CHECK1-NEXT: [ 6] other2 PROGBITS 0000000000000000 0000c0 000001 00 0 +# CHECK1-NEXT: [ 7] other3 PROGBITS 0000000000000000 0000d0 000001 00 0 +# CHECK1-NEXT: [ 8] .symtab SYMTAB 0000000000000000 0000d8 000030 18 10 +# CHECK1-NEXT: [ 9] .shstrtab STRTAB 0000000000000000 000108 00004d 00 0 +# CHECK1-NEXT: [10] .strtab STRTAB 0000000000000000 000155 000008 00 0 +# CHECK1-NEXT: [11] data2 PROGBITS 00000000000000b4 0000b4 000001 00 WA 0 +# CHECK1: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align +# CHECK1-NEXT: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0000b5 0x0000b5 RWE 0x1000 +# CHECK1-NEXT: 0x60000000 0x0000b8 0x0000000000000000 0x0000000000000000 0x000009 0x000001 0x8 + +#--- a.lds +SECTIONS { + .bss : { *(.bss) } + data1 : { *(data1) } + other1 : { *(other1) } + other2 : { *(other2) } + data2 : { *(data2) } + .text : { *(.text) } + /DISCARD/ : { *(.comment) } +} + +#--- b.lds +PHDRS { + text PT_LOAD FILEHDR PHDRS; + foo 0x60000000 FLAGS (0); +} +SECTIONS { + . 
= SIZEOF_HEADERS; + .text : { *(.text) } : text + .bss : { *(.bss) } : text + data1 : { *(data1) } : text + other1 : { *(other1) } : foo + other2 : { *(other2) } : foo + data2 : { *(data1) } : text + /DISCARD/ : { *(.comment) } +} + +#--- main.s +.globl _start +_start: nop +.section data1,"aw"; .byte 0 +.section data2,"aw"; .byte 0 +.section data3,"aw"; .byte 0 +.bss; .byte 0 + +.section other1; .p2align 2; .byte 0 +.section other2; .p2align 3; .byte 0 +.section other3; .p2align 4; .byte 0 diff --git a/lld/test/ELF/linkerscript/sections.s b/lld/test/ELF/linkerscript/sections.s index fa346406b743f8..539aa9c1705888 100644 --- a/lld/test/ELF/linkerscript/sections.s +++ b/lld/test/ELF/linkerscript/sections.s @@ -25,39 +25,6 @@ # SEC-DEFAULT: 7 .shstrtab 0000003b {{[0-9a-f]*}} # SEC-DEFAULT: 8 .strtab 00000008 {{[0-9a-f]*}} -## Sections are placed in the order specified by the linker script. .data has -## a PT_LOAD segment, even if it is preceded by a non-alloc section. To -## allow this, place non-alloc orphan sections at the end and advance -## location counters for non-alloc non-orphan sections. -# RUN: echo "SECTIONS { \ -# RUN: .bss : { *(.bss) } \ -# RUN: other : { *(other) } \ -# RUN: .shstrtab : { *(.shstrtab) } \ -# RUN: .symtab : { *(.symtab) } \ -# RUN: .strtab : { *(.strtab) } \ -# RUN: .data : { *(.data) } \ -# RUN: .text : { *(.text) } }" > %t3.lds -# RUN: ld.lld -o %t3a -T %t3.lds %t -# RUN: llvm-readelf -S -l %t3a | FileCheck --check-prefix=SEC-ORDER %s -# RUN: ld.lld -o %t3b -T %t3.lds --unique %t -# RUN: llvm-readelf -S -l %t3b | FileCheck --check-prefix=SEC-ORDER %s - -# SEC-ORDER: [Nr] Name Type Address Off Size ES Flg -# SEC-ORDER: [ 0] NULL 0000000000000000 000000 000000 00 -# SEC-ORDER-NEXT: [ 1] .bss NOBITS 0000000000000000 001000 000002 00 WA -# SEC-ORDER-NEXT: [ 2] other PROGBITS 0000000000000002 001002 000003 00 WA -# SEC-ORDER-NEXT: [ 3] .shstrtab STRTAB 0000000000000005 001005 00003b 00 -# SEC-ORDER-NEXT: [ 4] .symtab SYMTAB 0000000000000040 001040 000030 18 -# SEC-ORDER-NEXT: [ 5] .strtab STRTAB 0000000000000070 001070 000008 00 -# SEC-ORDER-NEXT: [ 6] .data PROGBITS 0000000000000078 001078 000020 00 WA -# SEC-ORDER-NEXT: [ 7] .text PROGBITS 0000000000000098 001098 00000e 00 AX -# SEC-ORDER-NEXT: [ 8] .comment PROGBITS 0000000000000000 0010a6 000008 01 MS - -# SEC-ORDER: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# SEC-ORDER-NEXT: LOAD 0x001000 0x0000000000000000 0x0000000000000000 0x000098 0x000098 RW 0x1000 -# SEC-ORDER-NEXT: LOAD 0x001098 0x0000000000000098 0x0000000000000098 0x00000e 0x00000e R E 0x1000 -# SEC-ORDER-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW 0 - # .text and .data have swapped names but proper sizes and types. 
# RUN: echo "SECTIONS { \
# RUN:       .data : { *(.text) } \
@@ -112,12 +79,12 @@
 # SEP-BY-NONALLOC:      [ 1] .text    PROGBITS 0000000000000000 001000 00000e 00 AX
 # SEP-BY-NONALLOC-NEXT: [ 2] .data    PROGBITS 000000000000000e 00100e 000020 00 WA
 # SEP-BY-NONALLOC-NEXT: [ 3] .bss     NOBITS   000000000000002e 00102e 000002 00 WA
-# SEP-BY-NONALLOC-NEXT: [ 4] .comment PROGBITS 0000000000000030 00102e 000008 01 MS
-# SEP-BY-NONALLOC-NEXT: [ 5] other    PROGBITS 0000000000000038 001038 000003 00 WA
+# SEP-BY-NONALLOC-NEXT: [ 4] .comment PROGBITS 0000000000000000 001033 000008 01 MS
+# SEP-BY-NONALLOC:      [ 8] other    PROGBITS 0000000000000030 001030 000003 00 WA

 # SEP-BY-NONALLOC:      Type      Offset   VirtAddr           PhysAddr           FileSiz  MemSiz   Flg Align
 # SEP-BY-NONALLOC-NEXT: LOAD      0x001000 0x0000000000000000 0x0000000000000000 0x00000e 0x00000e R E 0x1000
-# SEP-BY-NONALLOC-NEXT: LOAD      0x00100e 0x000000000000000e 0x000000000000000e 0x00002d 0x00002d RW  0x1000
+# SEP-BY-NONALLOC-NEXT: LOAD      0x00100e 0x000000000000000e 0x000000000000000e 0x000025 0x000025 RW  0x1000
 # SEP-BY-NONALLOC-NEXT: GNU_STACK 0x000000 0x0000000000000000 0x0000000000000000 0x000000 0x000000 RW  0

 # Input section pattern contains additional semicolon.
diff --git a/lld/test/ELF/linkerscript/symbols-non-alloc.test b/lld/test/ELF/linkerscript/symbols-non-alloc.test
index 2bd6fc84df4678..ca47b2bfbcac68 100644
--- a/lld/test/ELF/linkerscript/symbols-non-alloc.test
+++ b/lld/test/ELF/linkerscript/symbols-non-alloc.test
@@ -1,6 +1,6 @@
 # REQUIRES: x86
 ## The address of a symbol assignment after a non-SHF_ALLOC section equals the
-## end address of the section.
+## end address of the last SHF_ALLOC section.

 # RUN: echo '.section .nonalloc,""; .quad 0' \
 # RUN:   | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t
@@ -8,10 +8,11 @@
 # RUN: llvm-objdump --section-headers -t %t2 | FileCheck %s

 # CHECK: Sections:
-# CHECK: .nonalloc 00000008 0000000000000120
+# CHECK: .text     00000000 0000000000000120
+# CHECK: .nonalloc 00000008 0000000000000000

 # CHECK: SYMBOL TABLE:
-# CHECK: 0000000000000128 g .nonalloc 0000000000000000 Sym
+# CHECK: 0000000000000120 g .nonalloc 0000000000000000 Sym

 SECTIONS {
   . = SIZEOF_HEADERS;

From 645c6856a68af9b9dd7d918f630560cf07462ed7 Mon Sep 17 00:00:00 2001
From: Jamie Schmeiser
Date: Tue, 18 Aug 2020 16:05:20 +0000
Subject: [PATCH 053/101] [NFC] Add raw_ostream parameter to printIR routines

This is a non-functional change that generalizes the printIR routines so that
the output can be saved and manipulated rather than being written directly to
dbgs(). It is a prerequisite for several upcoming changes that add new ways
of examining changes made to the IR in the new pass manager.
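To illustrate what the extra parameter enables (an illustration only, not
part of this patch): with a raw_ostream sink, a later consumer can capture a
dump in memory using LLVM's raw_string_ostream instead of writing straight to
dbgs(). A minimal sketch against the generalized helper below:

```cpp
#include "llvm/Support/raw_ostream.h"
#include <string>

// Capture the textual IR rather than printing it immediately.
std::string Buffer;
llvm::raw_string_ostream OS(Buffer);
printIR(OS, F, "*** IR Dump Before Pass ***");
OS.flush();
// Buffer can now be stored and diffed against a dump taken after the pass.
```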
Reviewed By: aeubanks (Arthur Eubanks) Differential Revision: https://reviews.llvm.org/D85999 --- llvm/lib/Passes/StandardInstrumentations.cpp | 61 ++++++++++---------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 9e9caa1557b406..55dbca71437138 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -86,39 +86,39 @@ Optional> unwrapModule(Any IR) { llvm_unreachable("Unknown IR unit"); } -void printIR(const Function *F, StringRef Banner, StringRef Extra = StringRef(), - bool Brief = false) { +void printIR(raw_ostream &OS, const Function *F, StringRef Banner, + StringRef Extra = StringRef(), bool Brief = false) { if (Brief) { - dbgs() << F->getName() << '\n'; + OS << F->getName() << '\n'; return; } if (!llvm::isFunctionInPrintList(F->getName())) return; - dbgs() << Banner << Extra << "\n" << static_cast(*F); + OS << Banner << Extra << "\n" << static_cast(*F); } -void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef(), - bool Brief = false) { +void printIR(raw_ostream &OS, const Module *M, StringRef Banner, + StringRef Extra = StringRef(), bool Brief = false) { if (Brief) { - dbgs() << M->getName() << '\n'; + OS << M->getName() << '\n'; return; } if (llvm::isFunctionInPrintList("*") || llvm::forcePrintModuleIR()) { - dbgs() << Banner << Extra << "\n"; - M->print(dbgs(), nullptr, false); + OS << Banner << Extra << "\n"; + M->print(OS, nullptr, false); } else { for (const auto &F : M->functions()) { - printIR(&F, Banner, Extra); + printIR(OS, &F, Banner, Extra); } } } -void printIR(const LazyCallGraph::SCC *C, StringRef Banner, +void printIR(raw_ostream &OS, const LazyCallGraph::SCC *C, StringRef Banner, StringRef Extra = StringRef(), bool Brief = false) { if (Brief) { - dbgs() << *C << '\n'; + OS << *C << '\n'; return; } @@ -127,47 +127,48 @@ void printIR(const LazyCallGraph::SCC *C, StringRef Banner, const Function &F = N.getFunction(); if (!F.isDeclaration() && llvm::isFunctionInPrintList(F.getName())) { if (!BannerPrinted) { - dbgs() << Banner << Extra << "\n"; + OS << Banner << Extra << "\n"; BannerPrinted = true; } - F.print(dbgs()); + F.print(OS); } } } -void printIR(const Loop *L, StringRef Banner, bool Brief = false) { +void printIR(raw_ostream &OS, const Loop *L, StringRef Banner, + bool Brief = false) { if (Brief) { - dbgs() << *L; + OS << *L; return; } const Function *F = L->getHeader()->getParent(); if (!llvm::isFunctionInPrintList(F->getName())) return; - llvm::printLoop(const_cast(*L), dbgs(), std::string(Banner)); + llvm::printLoop(const_cast(*L), OS, std::string(Banner)); } /// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into /// llvm::Any and does actual print job. 
-void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false, - bool Brief = false) { +void unwrapAndPrint(raw_ostream &OS, Any IR, StringRef Banner, + bool ForceModule = false, bool Brief = false) { if (ForceModule) { if (auto UnwrappedModule = unwrapModule(IR)) - printIR(UnwrappedModule->first, Banner, UnwrappedModule->second); + printIR(OS, UnwrappedModule->first, Banner, UnwrappedModule->second); return; } if (any_isa(IR)) { const Module *M = any_cast(IR); assert(M && "module should be valid for printing"); - printIR(M, Banner, "", Brief); + printIR(OS, M, Banner, "", Brief); return; } if (any_isa(IR)) { const Function *F = any_cast(IR); assert(F && "function should be valid for printing"); - printIR(F, Banner, "", Brief); + printIR(OS, F, Banner, "", Brief); return; } @@ -175,14 +176,14 @@ void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false, const LazyCallGraph::SCC *C = any_cast(IR); assert(C && "scc should be valid for printing"); std::string Extra = std::string(formatv(" (scc: {0})", C->getName())); - printIR(C, Banner, Extra, Brief); + printIR(OS, C, Banner, Extra, Brief); return; } if (any_isa(IR)) { const Loop *L = any_cast(IR); assert(L && "Loop should be valid for printing"); - printIR(L, Banner, Brief); + printIR(OS, L, Banner, Brief); return; } llvm_unreachable("Unknown wrapped IR type"); @@ -226,7 +227,7 @@ void PrintIRInstrumentation::printBeforePass(StringRef PassID, Any IR) { return; SmallString<20> Banner = formatv("*** IR Dump Before {0} ***", PassID); - unwrapAndPrint(IR, Banner, llvm::forcePrintModuleIR()); + unwrapAndPrint(dbgs(), IR, Banner, llvm::forcePrintModuleIR()); return; } @@ -241,7 +242,7 @@ void PrintIRInstrumentation::printAfterPass(StringRef PassID, Any IR) { popModuleDesc(PassID); SmallString<20> Banner = formatv("*** IR Dump After {0} ***", PassID); - unwrapAndPrint(IR, Banner, llvm::forcePrintModuleIR()); + unwrapAndPrint(dbgs(), IR, Banner, llvm::forcePrintModuleIR()); } void PrintIRInstrumentation::printAfterPassInvalidated(StringRef PassID) { @@ -262,7 +263,7 @@ void PrintIRInstrumentation::printAfterPassInvalidated(StringRef PassID) { SmallString<20> Banner = formatv("*** IR Dump After {0} *** invalidated: ", PassID); - printIR(M, Banner, Extra); + printIR(dbgs(), M, Banner, Extra); } void PrintIRInstrumentation::registerCallbacks( @@ -315,7 +316,7 @@ void PrintPassInstrumentation::registerCallbacks( "Unexpectedly skipping special pass"); dbgs() << "Skipping pass: " << PassID << " on "; - unwrapAndPrint(IR, "", false, true); + unwrapAndPrint(dbgs(), IR, "", false, true); }); PIC.registerBeforeNonSkippedPassCallback( @@ -324,12 +325,12 @@ void PrintPassInstrumentation::registerCallbacks( return; dbgs() << "Running pass: " << PassID << " on "; - unwrapAndPrint(IR, "", false, true); + unwrapAndPrint(dbgs(), IR, "", false, true); }); PIC.registerBeforeAnalysisCallback([](StringRef PassID, Any IR) { dbgs() << "Running analysis: " << PassID << " on "; - unwrapAndPrint(IR, "", false, true); + unwrapAndPrint(dbgs(), IR, "", false, true); }); } From aa48a480b89ab969448851ee888357d42ee7761e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 18 Aug 2020 09:07:38 -0700 Subject: [PATCH 054/101] [llvm-dwarfdump][test] Add a --statistics test for a DW_AT_artificial variable There is an untested but useful case: `this` (even if not written) is counted as a source variable. 
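For reference, the C++ shape that exercises this case is a member function
with no written parameters; a sketch mirroring the comment block added to the
test below:

```cpp
struct T {
  void empty();
};
// No parameter appears in the source, yet the DWARF for T::empty() carries
// an artificial `this` (DW_AT_artificial) that the statistics must count.
void T::empty() {}
```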
Reviewed By: dblaikie Differential Revision: https://reviews.llvm.org/D86044 --- .../tools/llvm-dwarfdump/X86/statistics.ll | 44 ++++++++++++++++--- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll index 589375ac6f55b2..bd717dfc85b388 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics.ll @@ -23,6 +23,11 @@ ; ; int boo(int, int) {} +; struct T { +; void empty(); +; }; +; void T::empty() {} + ; Following variables/arguments/members should be counted: ; - GlobalConst, ; - Global, @@ -30,16 +35,17 @@ ; - square::i, ; - cube::i, cube::squared ; - boo::1, boo::2 +; - this in T::empty() ; Skipped entities: ; - declaration of test::a, ; - non-constant member S:fn, ; - arguments of S:fn. -; CHECK: "#unique source variables":9 +; CHECK: "#unique source variables":10 ; +1 extra inline i. -; CHECK: "#source variables":10 +; CHECK: "#source variables":11 ; -1 square::i -; CHECK: "#source variables with location":9 +; CHECK: "#source variables with location":10 ; CHECK: "sum_all_local_vars(#bytes in parent scope)":[[BYTES:[0-9]+]] ; Because of the dbg.value in the middle of the function, the pc range coverage ; must be below 100%. @@ -48,11 +54,11 @@ ; CHECK: "sum_all_local_vars(#bytes in parent scope covered by DW_AT_location)": ; CHECK: "#bytes witin functions":[[FUNCSIZE:[0-9]+]] ; CHECK: "#bytes witin inlined functions":[[INLINESIZE:[0-9]+]] -; CHECK: "#bytes in __debug_info":380 +; CHECK: "#bytes in __debug_info":459 ; CHECK: "#bytes in __debug_loc":35 -; CHECK: "#bytes in __debug_abbrev":303 -; CHECK: "#bytes in __debug_line":117 -; CHECK: "#bytes in __debug_str":204 +; CHECK: "#bytes in __debug_abbrev":384 +; CHECK: "#bytes in __debug_line":126 +; CHECK: "#bytes in __debug_str":231 ; ModuleID = '/tmp/quality.cpp' source_filename = "/tmp/quality.cpp" @@ -118,6 +124,17 @@ entry: ret i32 0, !dbg !58 } +%struct.T = type { i8 } + +define void @_ZN1T5emptyEv(%struct.T* %this) #2 !dbg !59 { +entry: + %this.addr = alloca %struct.T*, align 8 + store %struct.T* %this, %struct.T** %this.addr, align 8 + call void @llvm.dbg.declare(metadata %struct.T** %this.addr, metadata !67, metadata !DIExpression()), !dbg !69 + %this1 = load %struct.T*, %struct.T** %this.addr, align 8 + ret void, !dbg !70 +} + attributes #0 = { alwaysinline nounwind ssp uwtable } attributes #1 = { nounwind readnone speculatable } attributes #2 = { noinline nounwind optnone ssp uwtable } @@ -185,3 +202,16 @@ attributes #2 = { noinline nounwind optnone ssp uwtable } !56 = !DILocation(line: 10, column: 12, scope: !52) !57 = !DILocalVariable(arg: 2, scope: !52, file: !3, line: 10, type: !8) !58 = !DILocation(line: 10, column: 17, scope: !52) + +!59 = distinct !DISubprogram(name: "empty", linkageName: "_ZN1T5emptyEv", scope: !60, file: !3, line: 25, type: !63, scopeLine: 25, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, declaration: !62, retainedNodes: !4) +!60 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "T", file: !3, line: 22, size: 8, flags: DIFlagTypePassByValue, elements: !61, identifier: "_ZTS1T") +!61 = !{!62} +!62 = !DISubprogram(name: "empty", linkageName: "_ZN1T5emptyEv", scope: !60, file: !3, line: 23, type: !63, scopeLine: 23, flags: DIFlagPrototyped, spFlags: 0) +!63 = !DISubroutineType(types: !64) +!64 = !{!65, !66} +!65 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!66 = !DIDerivedType(tag: 
DW_TAG_pointer_type, baseType: !60, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!67 = !DILocalVariable(name: "this", arg: 1, scope: !59, type: !68, flags: DIFlagArtificial | DIFlagObjectPointer) +!68 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !60, size: 64) +!69 = !DILocation(line: 0, scope: !59) +!70 = !DILocation(line: 25, column: 19, scope: !59) From bb54bcf84970c04c9748004f3a4cf59b0c1832a7 Mon Sep 17 00:00:00 2001 From: Dokyung Song Date: Wed, 5 Aug 2020 23:12:19 +0000 Subject: [PATCH 055/101] [libFuzzer] Fix arguments of InsertPartOf/CopyPartOf calls in CrossOver mutator. The CrossOver mutator is meant to cross over two given buffers (referred to as the first/second buffer henceforth). Previously InsertPartOf/CopyPartOf calls used in the CrossOver mutator incorrectly inserted/copied part of the second buffer into a "scratch buffer" (MutateInPlaceHere of the size CurrentMaxMutationLen), rather than the first buffer. This is not intended behavior, because the scratch buffer does not always (i) contain the content of the first buffer, and (ii) have the same size as the first buffer; CurrentMaxMutationLen is typically a lot larger than the size of the first buffer. This patch fixes the issue by using the first buffer instead of the scratch buffer in InsertPartOf/CopyPartOf calls. A FuzzBench experiment was run to make sure that this change does not inadvertently degrade the performance. The performance is largely the same; more details can be found at: https://storage.googleapis.com/fuzzer-test-suite-public/fixcrossover-report/index.html This patch also adds two new tests, namely "cross_over_insert" and "cross_over_copy", which specifically target InsertPartOf and CopyPartOf, respectively. - cross_over_insert.test checks if the fuzzer can use InsertPartOf to trigger the crash. - cross_over_copy.test checks if the fuzzer can use CopyPartOf to trigger the crash. These newly added tests were designed to pass with the current patch, but not without the it (with 790878f291fa5dc58a1c560cb6cc76fd1bfd1c5a these tests do not pass). To achieve this, -max_len was intentionally given a high value. Without this patch, InsertPartOf/CopyPartOf will generate larger inputs, possibly with unpredictable data in it, thereby failing to trigger the crash. The test pass condition for these new tests is narrowed down by (i) limiting mutation depth to 1 (i.e., a single CrossOver mutation should be able to trigger the crash) and (ii) checking whether the mutation sequence of "CrossOver-" leads to the crash. Also note that these newly added tests and an existing test (cross_over.test) all use "-reduce_inputs=0" flags to prevent reducing inputs; it's easier to force the fuzzer to keep original input string this way than tweaking cov-instrumented basic blocks in the source code of the fuzzer executable. 
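As a rough sketch of the intended semantics (paraphrased, not libFuzzer's
code; the offsets are fixed here, whereas libFuzzer chooses them at random):
a CopyPartOf-style cross over must overwrite a region of the first input in
place, which is exactly what cross_over_copy.test checks:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

// Copy N bytes from `From` (at FromOff) over `To` (at ToOff); the size of
// `To` is unchanged.
static void copyPartOf(const char *From, std::size_t FromOff, char *To,
                       std::size_t ToOff, std::size_t N) {
  std::memcpy(To + ToOff, From + FromOff, N);
}

int main() {
  char First[] = "ABCDE00HIJ";        // corpus input A of cross_over_copy.test
  const char Second[] = "ZFG";        // corpus input B
  copyPartOf(Second, 1, First, 5, 2); // copy "FG" over the "00"
  std::printf("%s\n", First);         // prints ABCDEFGHIJ, the crashing input
}
```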
Differential Revision: https://reviews.llvm.org/D85554 --- compiler-rt/lib/fuzzer/FuzzerMutate.cpp | 14 ++++++------- compiler-rt/test/fuzzer/CrossOverTest.cpp | 15 +++++++------- compiler-rt/test/fuzzer/cross_over.test | 4 ++-- compiler-rt/test/fuzzer/cross_over_copy.test | 20 +++++++++++++++++++ .../test/fuzzer/cross_over_insert.test | 20 +++++++++++++++++++ 5 files changed, 56 insertions(+), 17 deletions(-) create mode 100644 compiler-rt/test/fuzzer/cross_over_copy.test create mode 100644 compiler-rt/test/fuzzer/cross_over_insert.test diff --git a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp index 29541eac5dc60b..df9ada45bb0391 100644 --- a/compiler-rt/lib/fuzzer/FuzzerMutate.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerMutate.cpp @@ -425,26 +425,26 @@ size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size, if (!CrossOverWith) return 0; const Unit &O = *CrossOverWith; if (O.empty()) return 0; - MutateInPlaceHere.resize(MaxSize); - auto &U = MutateInPlaceHere; size_t NewSize = 0; switch(Rand(3)) { case 0: - NewSize = CrossOver(Data, Size, O.data(), O.size(), U.data(), U.size()); + MutateInPlaceHere.resize(MaxSize); + NewSize = CrossOver(Data, Size, O.data(), O.size(), + MutateInPlaceHere.data(), MaxSize); + memcpy(Data, MutateInPlaceHere.data(), NewSize); break; case 1: - NewSize = InsertPartOf(O.data(), O.size(), U.data(), U.size(), MaxSize); + NewSize = InsertPartOf(O.data(), O.size(), Data, Size, MaxSize); if (!NewSize) - NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size()); + NewSize = CopyPartOf(O.data(), O.size(), Data, Size); break; case 2: - NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size()); + NewSize = CopyPartOf(O.data(), O.size(), Data, Size); break; default: assert(0); } assert(NewSize > 0 && "CrossOver returned empty unit"); assert(NewSize <= MaxSize && "CrossOver returned overisized unit"); - memcpy(Data, U.data(), NewSize); return NewSize; } diff --git a/compiler-rt/test/fuzzer/CrossOverTest.cpp b/compiler-rt/test/fuzzer/CrossOverTest.cpp index a7643570a92b25..b4506f665dc762 100644 --- a/compiler-rt/test/fuzzer/CrossOverTest.cpp +++ b/compiler-rt/test/fuzzer/CrossOverTest.cpp @@ -4,10 +4,11 @@ // Test for a fuzzer. 
The fuzzer must find the string // ABCDEFGHIJ -// We use it as a test for CrossOver functionality -// by passing two inputs to it: -// ABCDE00000 -// ZZZZZFGHIJ +// We use it as a test for each of CrossOver functionalities +// by passing the following sets of two inputs to it: +// {ABCDE00000, ZZZZZFGHIJ} +// {ABCDEHIJ, ZFG} to specifically test InsertPartOf +// {ABCDE00HIJ, ZFG} to specifically test CopyPartOf // #include #include @@ -42,13 +43,11 @@ static const uint32_t ExpectedHash = 0xe1677acb; extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { // fprintf(stderr, "ExpectedHash: %x\n", ExpectedHash); - if (Size != 10) return 0; + if (Size == 10 && ExpectedHash == simple_hash(Data, Size)) + *NullPtr = 0; if (*Data == 'A') Sink++; if (*Data == 'Z') Sink--; - if (ExpectedHash == simple_hash(Data, Size)) - *NullPtr = 0; return 0; } - diff --git a/compiler-rt/test/fuzzer/cross_over.test b/compiler-rt/test/fuzzer/cross_over.test index 058b5eb2c85cd0..64e06e8cd3667b 100644 --- a/compiler-rt/test/fuzzer/cross_over.test +++ b/compiler-rt/test/fuzzer/cross_over.test @@ -12,7 +12,7 @@ RUN: echo -n ABCDE00000 > %t-corpus/A RUN: echo -n ZZZZZFGHIJ > %t-corpus/B -RUN: not %run %t-CrossOverTest -max_len=10 -seed=1 -runs=10000000 %t-corpus +RUN: not %run %t-CrossOverTest -max_len=10 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus # Test the same thing but using -seed_inputs instead of passing the corpus dir. -RUN: not %run %t-CrossOverTest -max_len=10 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B +RUN: not %run %t-CrossOverTest -max_len=10 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B diff --git a/compiler-rt/test/fuzzer/cross_over_copy.test b/compiler-rt/test/fuzzer/cross_over_copy.test new file mode 100644 index 00000000000000..24b2f9b3b11325 --- /dev/null +++ b/compiler-rt/test/fuzzer/cross_over_copy.test @@ -0,0 +1,20 @@ +# Tests CrossOver CopyPartOf. +# We want to make sure that the test can find the input +# ABCDEFGHIJ when given two other inputs in the seed corpus: +# ABCDE00HIJ and +# (Z) FG +# +RUN: %cpp_compiler %S/CrossOverTest.cpp -o %t-CrossOverTest + +RUN: rm -rf %t-corpus +RUN: mkdir %t-corpus +RUN: echo -n ABCDE00HIJ > %t-corpus/A +RUN: echo -n ZFG > %t-corpus/B + + +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus 2>&1 | FileCheck %s + +# Test the same thing but using -seed_inputs instead of passing the corpus dir. +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B 2>&1 | FileCheck %s + +CHECK: MS: 1 CrossOver- diff --git a/compiler-rt/test/fuzzer/cross_over_insert.test b/compiler-rt/test/fuzzer/cross_over_insert.test new file mode 100644 index 00000000000000..cb7d4fab81ef7e --- /dev/null +++ b/compiler-rt/test/fuzzer/cross_over_insert.test @@ -0,0 +1,20 @@ +# Tests CrossOver InsertPartOf. +# We want to make sure that the test can find the input +# ABCDEFGHIJ when given two other inputs in the seed corpus: +# ABCDE HIJ and +# (Z) FG +# +RUN: %cpp_compiler %S/CrossOverTest.cpp -o %t-CrossOverTest + +RUN: rm -rf %t-corpus +RUN: mkdir %t-corpus +RUN: echo -n ABCDEHIJ > %t-corpus/A +RUN: echo -n ZFG > %t-corpus/B + + +RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 %t-corpus 2>&1 | FileCheck %s + +# Test the same thing but using -seed_inputs instead of passing the corpus dir. 
+RUN: not %run %t-CrossOverTest -mutate_depth=1 -max_len=1024 -reduce_inputs=0 -seed=1 -runs=10000000 -seed_inputs=%t-corpus/A,%t-corpus/B 2>&1 | FileCheck %s
+
+CHECK: MS: 1 CrossOver-

From cc98a0fbe46511ebcbca5600c9ec49901469ae3d Mon Sep 17 00:00:00 2001
From: George Mitenkov
Date: Tue, 18 Aug 2020 18:42:23 +0300
Subject: [PATCH 056/101] [MLIR][SPIRVToLLVM] Additional conversions for
 spirv-runner

This patch adds more op/type conversion support necessary for `spirv-runner`:
- EntryPoint/ExecutionMode: currently removed, since we assume the kernel
  module contains only one kernel function.
- The StorageBuffer storage class is now supported. We are not concerned with
  multithreading, so this is fine for now.
- Type conversion is enhanced: regular offsets and strides for structs and
  arrays are now supported (based on `VulkanLayoutUtils`).
- Support for `spv.AccessChain`, which is modelled with the GEP op in the
  LLVM dialect.

Reviewed By: mravishankar

Differential Revision: https://reviews.llvm.org/D86109
---
 .../SPIRVToLLVM/ConvertSPIRVToLLVM.cpp        | 117 ++++++++++++++----
 .../SPIRVToLLVM/memory-ops-to-llvm.mlir       |  25 ++++
 .../SPIRVToLLVM/misc-ops-to-llvm.mlir         |  17 +++
 .../spirv-types-to-llvm-invalid.mlir          |  11 +-
 .../SPIRVToLLVM/spirv-types-to-llvm.mlir      |   8 +-
 5 files changed, 141 insertions(+), 37 deletions(-)

diff --git a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
index e7c5b3c9f6dcd4..9c2ba26274e9ab 100644
--- a/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
+++ b/mlir/lib/Conversion/SPIRVToLLVM/ConvertSPIRVToLLVM.cpp
@@ -14,6 +14,7 @@
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SPIRV/LayoutUtils.h"
 #include "mlir/Dialect/SPIRV/SPIRVDialect.h"
 #include "mlir/Dialect/SPIRV/SPIRVOps.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -179,6 +180,22 @@ static Value processCountOrOffset(Location loc, Value value, Type srcType,
   return optionallyTruncateOrExtend(loc, broadcasted, dstType, rewriter);
 }

+/// Converts SPIR-V struct with a regular (according to `VulkanLayoutUtils`)
+/// offset to LLVM struct. Otherwise, the conversion is not supported.
+static Optional<Type>
+convertStructTypeWithOffset(spirv::StructType type,
+                            LLVMTypeConverter &converter) {
+  if (type != VulkanLayoutUtils::decorateType(type))
+    return llvm::None;
+
+  auto elementsVector = llvm::to_vector<8>(
+      llvm::map_range(type.getElementTypes(), [&](Type elementType) {
+        return converter.convertType(elementType).cast<LLVM::LLVMType>();
+      }));
+  return LLVM::LLVMType::getStructTy(type.getContext(), elementsVector,
+                                     /*isPacked=*/false);
+}
+
 /// Converts SPIR-V struct with no offset to packed LLVM struct.
 static Type convertStructTypePacked(spirv::StructType type,
                                     LLVMTypeConverter &converter) {
@@ -223,16 +240,22 @@ static LogicalResult replaceWithLoadOrStore(Operation *op,
 // Type conversion
 //===----------------------------------------------------------------------===//

-/// Converts SPIR-V array type to LLVM array. There is no modelling of array
-/// stride at the moment.
+/// Converts SPIR-V array type to LLVM array. Natural stride (according to
+/// `VulkanLayoutUtils`) is also mapped to LLVM array. This has to be respected
+/// when converting ops that manipulate array types.
static Optional convertArrayType(spirv::ArrayType type, TypeConverter &converter) { - if (type.getArrayStride() != 0) + unsigned stride = type.getArrayStride(); + Type elementType = type.getElementType(); + auto sizeInBytes = elementType.cast().getSizeInBytes(); + if (stride != 0 && + !(sizeInBytes.hasValue() && sizeInBytes.getValue() == stride)) return llvm::None; - auto elementType = - converter.convertType(type.getElementType()).cast(); + + auto llvmElementType = + converter.convertType(elementType).cast(); unsigned numElements = type.getNumElements(); - return LLVM::LLVMType::getArrayTy(elementType, numElements); + return LLVM::LLVMType::getArrayTy(llvmElementType, numElements); } /// Converts SPIR-V pointer type to LLVM pointer. Pointer's storage class is not @@ -257,13 +280,15 @@ static Optional convertRuntimeArrayType(spirv::RuntimeArrayType type, } /// Converts SPIR-V struct to LLVM struct. There is no support of structs with -/// member decorations or with offset. +/// member decorations. Also, only natural offset is supported. static Optional convertStructType(spirv::StructType type, LLVMTypeConverter &converter) { SmallVector memberDecorations; type.getMemberDecorations(memberDecorations); - if (type.hasOffset() || !memberDecorations.empty()) + if (!memberDecorations.empty()) return llvm::None; + if (type.hasOffset()) + return convertStructTypeWithOffset(type, converter); return convertStructTypePacked(type, converter); } @@ -273,6 +298,31 @@ static Optional convertStructType(spirv::StructType type, namespace { +class AccessChainPattern : public SPIRVToLLVMConversion { +public: + using SPIRVToLLVMConversion::SPIRVToLLVMConversion; + + LogicalResult + matchAndRewrite(spirv::AccessChainOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto dstType = typeConverter.convertType(op.component_ptr().getType()); + if (!dstType) + return failure(); + // To use GEP we need to add a first 0 index to go through the pointer. + auto indices = llvm::to_vector<4>(op.indices()); + Type indexType = op.indices().front().getType(); + auto llvmIndexType = typeConverter.convertType(indexType); + if (!llvmIndexType) + return failure(); + Value zero = rewriter.create( + op.getLoc(), llvmIndexType, rewriter.getIntegerAttr(indexType, 0)); + indices.insert(indices.begin(), zero); + rewriter.replaceOpWithNewOp(op, dstType, op.base_ptr(), + indices); + return success(); + } +}; + class AddressOfPattern : public SPIRVToLLVMConversion { public: using SPIRVToLLVMConversion::SPIRVToLLVMConversion; @@ -545,11 +595,14 @@ class GlobalVariablePattern if (!dstType) return failure(); - // Limit conversion to the current invocation only for now. + // Limit conversion to the current invocation only or `StorageBuffer` + // required by SPIR-V runner. + // This is okay because multiple invocations are not supported yet. auto storageClass = srcType.getStorageClass(); if (storageClass != spirv::StorageClass::Input && storageClass != spirv::StorageClass::Private && - storageClass != spirv::StorageClass::Output) { + storageClass != spirv::StorageClass::Output && + storageClass != spirv::StorageClass::StorageBuffer) { return failure(); } @@ -757,6 +810,20 @@ class NotPattern : public SPIRVToLLVMConversion { } }; +/// A template pattern that erases the given `SPIRVOp`. 
+template <typename SPIRVOp>
+class ErasePattern : public SPIRVToLLVMConversion<SPIRVOp> {
+public:
+  using SPIRVToLLVMConversion<SPIRVOp>::SPIRVToLLVMConversion;
+
+  LogicalResult
+  matchAndRewrite(SPIRVOp op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.eraseOp(op);
+    return success();
+  }
+};
+
 class ReturnPattern : public SPIRVToLLVMConversion<spirv::ReturnOp> {
 public:
   using SPIRVToLLVMConversion<spirv::ReturnOp>::SPIRVToLLVMConversion;
@@ -875,18 +942,6 @@ class LoopPattern : public SPIRVToLLVMConversion<spirv::LoopOp> {
   }
 };
 
-class MergePattern : public SPIRVToLLVMConversion<spirv::MergeOp> {
-public:
-  using SPIRVToLLVMConversion<spirv::MergeOp>::SPIRVToLLVMConversion;
-
-  LogicalResult
-  matchAndRewrite(spirv::MergeOp op, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    rewriter.eraseOp(op);
-    return success();
-  }
-};
-
 /// Converts `spv.selection` with `spv.BranchConditional` in its header block.
 /// All blocks within selection should be reachable for conversion to succeed.
 class SelectionPattern : public SPIRVToLLVMConversion<spirv::SelectionOp> {
@@ -1266,11 +1321,18 @@ void mlir::populateSPIRVToLLVMConversionPatterns(
       ConstantScalarAndVectorPattern,
 
       // Control Flow ops
-      BranchConversionPattern, BranchConditionalConversionPattern, LoopPattern,
-      SelectionPattern, MergePattern,
+      BranchConversionPattern, BranchConditionalConversionPattern,
+      FunctionCallPattern, LoopPattern, SelectionPattern,
+      ErasePattern<spirv::MergeOp>,
+
+      // Entry points and execution mode
+      // Module generated from SPIR-V could have other "internal" functions, so
+      // having entry point and execution mode metadata can be useful. For now,
+      // simply remove them.
+      // TODO: Support EntryPoint/ExecutionMode properly.
+      ErasePattern<spirv::EntryPointOp>, ErasePattern<spirv::ExecutionModeOp>,
 
-      // Function Call op
-      FunctionCallPattern,
 
       // GLSL extended instruction set ops
       DirectConversionPattern<spirv::GLSLCeilOp, LLVM::FCeilOp>,
@@ -1295,8 +1357,9 @@ void mlir::populateSPIRVToLLVMConversionPatterns(
       NotPattern<spirv::LogicalNotOp>,
 
       // Memory ops
-      AddressOfPattern, GlobalVariablePattern, LoadStorePattern,
-      LoadStorePattern, VariablePattern,
+      AccessChainPattern, AddressOfPattern, GlobalVariablePattern,
+      LoadStorePattern<spirv::LoadOp, LLVM::LoadOp>,
+      LoadStorePattern<spirv::StoreOp, LLVM::StoreOp>, VariablePattern,
 
       // Miscellaneous ops
       DirectConversionPattern<spirv::SelectOp, LLVM::SelectOp>,
diff --git a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
index 51a734c462a36c..4402a513fb93e8 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/memory-ops-to-llvm.mlir
@@ -1,5 +1,30 @@
 // RUN: mlir-opt -convert-spirv-to-llvm %s | FileCheck %s
 
+//===----------------------------------------------------------------------===//
+// spv.AccessChain
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: @access_chain
+func @access_chain() -> () {
+  // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i32) : !llvm.i32
+  %0 = spv.constant 1: i32
+  %1 = spv.Variable : !spv.ptr<!spv.struct<f32, !spv.struct<f32, f32>>, Function>
+  // CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+  // CHECK: llvm.getelementptr %{{.*}}[%[[ZERO]], %[[ONE]], %[[ONE]]] : (!llvm.ptr<struct<packed (float, struct<packed (float, float)>)>>, !llvm.i32, !llvm.i32, !llvm.i32) -> !llvm.ptr<float>
+  %2 = spv.AccessChain %1[%0, %0] : !spv.ptr<!spv.struct<f32, !spv.struct<f32, f32>>, Function>, i32, i32
+  return
+}
+
+// CHECK-LABEL: @access_chain_array
+func @access_chain_array(%arg0 : i32) -> () {
+  %0 = spv.Variable : !spv.ptr<!spv.array<4x!spv.array<4xf32>>, Function>
+  // CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
+  // CHECK: llvm.getelementptr %{{.*}}[%[[ZERO]], %{{.*}}] : (!llvm.ptr<array<4 x array<4 x float>>>, !llvm.i32, !llvm.i32) -> !llvm.ptr<array<4 x float>>
+  %1 = spv.AccessChain %0[%arg0] : !spv.ptr<!spv.array<4x!spv.array<4xf32>>, Function>, i32
+  %2 =
spv.Load "Function" %1 ["Volatile"] : !spv.array<4xf32> + return +} + //===----------------------------------------------------------------------===// // spv.globalVariable and spv._address_of //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir index 2e74485323ede2..d54b91668cd972 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/misc-ops-to-llvm.mlir @@ -20,6 +20,23 @@ func @select_vector(%arg0: vector<2xi1>, %arg1: vector<2xi32>) { return } +//===----------------------------------------------------------------------===// +// spv.EntryPoint and spv.ExecutionMode +//===----------------------------------------------------------------------===// + +// CHECK: module { +// CHECK-NEXT: llvm.func @empty +// CHECK-NEXT: llvm.return +// CHECK-NEXT: } +// CHECK-NEXT: } +spv.module Logical GLSL450 { + spv.func @empty() -> () "None" { + spv.Return + } + spv.EntryPoint "GLCompute" @empty + spv.ExecutionMode @empty "LocalSize", 1, 1, 1 +} + //===----------------------------------------------------------------------===// // spv.Undef //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm-invalid.mlir b/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm-invalid.mlir index 96fb9f44af5783..87f0bd8d829808 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm-invalid.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm-invalid.mlir @@ -1,21 +1,14 @@ // RUN: mlir-opt %s -convert-spirv-to-llvm -verify-diagnostics -split-input-file // expected-error@+1 {{failed to legalize operation 'spv.func' that was explicitly marked illegal}} -spv.func @array_with_stride(%arg: !spv.array<4 x f32, stride=4>) -> () "None" { +spv.func @array_with_unnatural_stride(%arg: !spv.array<4 x f32, stride=8>) -> () "None" { spv.Return } // ----- // expected-error@+1 {{failed to legalize operation 'spv.func' that was explicitly marked illegal}} -spv.func @struct_with_offset1(%arg: !spv.struct) -> () "None" { - spv.Return -} - -// ----- - -// expected-error@+1 {{failed to legalize operation 'spv.func' that was explicitly marked illegal}} -spv.func @struct_with_offset2(%arg: !spv.struct) -> () "None" { +spv.func @struct_with_unnatural_offset(%arg: !spv.struct) -> () "None" { spv.Return } diff --git a/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm.mlir index d6618a7de7fed0..454b5b314f88a3 100644 --- a/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm.mlir +++ b/mlir/test/Conversion/SPIRVToLLVM/spirv-types-to-llvm.mlir @@ -5,7 +5,10 @@ //===----------------------------------------------------------------------===// // CHECK-LABEL: @array(!llvm.array<16 x float>, !llvm.array<32 x vec<4 x float>>) -func @array(!spv.array<16xf32>, !spv.array< 32 x vector<4xf32> >) -> () +func @array(!spv.array<16 x f32>, !spv.array< 32 x vector<4xf32> >) -> () + +// CHECK-LABEL: @array_with_natural_stride(!llvm.array<16 x float>) +func @array_with_natural_stride(!spv.array<16 x f32, stride=4>) -> () //===----------------------------------------------------------------------===// // Pointer type @@ -36,3 +39,6 @@ func @struct(!spv.struct) -> () // CHECK-LABEL: @struct_nested(!llvm.struct)>) func @struct_nested(!spv.struct>) + +// CHECK-LABEL: 
@struct_with_natural_offset(!llvm.struct<(i8, i32)>) +func @struct_with_natural_offset(!spv.struct) -> () From 31f02ac60aa8e89c04617e82fa2b1140e33e824d Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 17 Aug 2020 16:03:55 +0100 Subject: [PATCH 057/101] [ARM] Use mov operand if the mov cannot be moved while tail predicating There are some cases where the instruction that sets up the iteration count for a tail predicated loop cannot be moved before the dlstp, stopping tail predication entirely. This patch checks if the mov operand can be used and if so, uses that instead. Differential Revision: https://reviews.llvm.org/D86087 --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 37 ++- .../LowOverheadLoops/mov-after-dlstp.mir | 269 ++++++++++++++++++ .../Thumb2/LowOverheadLoops/mov-operand.ll | 81 ++++++ 3 files changed, 375 insertions(+), 12 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index be75d6bef08c4a..2e7cd412db1cc0 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -226,6 +226,7 @@ namespace { MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; MachineInstr *VCTP = nullptr; + MachineOperand TPNumElements; SmallPtrSet SecondaryVCTPs; VPTBlock *CurrentBlock = nullptr; SetVector CurrentPredicate; @@ -239,7 +240,8 @@ namespace { LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI, ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI, const ARMBaseInstrInfo &TII) - : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) { + : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII), + TPNumElements(MachineOperand::CreateImm(0)) { MF = ML.getHeader()->getParent(); if (auto *MBB = ML.getLoopPreheader()) Preheader = MBB; @@ -291,11 +293,10 @@ namespace { SmallVectorImpl &getVPTBlocks() { return VPTBlocks; } - // Return the loop iteration count, or the number of elements if we're tail - // predicating. - MachineOperand &getCount() { - return IsTailPredicationLegal() ? - VCTP->getOperand(1) : Start->getOperand(0); + // Return the operand for the loop start instruction. This will be the loop + // iteration count, or the number of elements if we're tail predicating. + MachineOperand &getLoopStartOperand() { + return IsTailPredicationLegal() ? TPNumElements : Start->getOperand(0); } unsigned getStartOpcode() const { @@ -453,7 +454,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { // of the iteration count, to the loop start instruction. The number of // elements is provided to the vctp instruction, so we need to check that // we can use this register at InsertPt. - Register NumElements = VCTP->getOperand(1).getReg(); + TPNumElements = VCTP->getOperand(1); + Register NumElements = TPNumElements.getReg(); // If the register is defined within loop, then we can't perform TP. // TODO: Check whether this is just a mov of a register that would be @@ -466,9 +468,8 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { // The element count register maybe defined after InsertPt, in which case we // need to try to move either InsertPt or the def so that the [w|d]lstp can // use the value. - // TODO: On failing to move an instruction, check if the count is provided by - // a mov and whether we can use the mov operand directly. 
MachineBasicBlock *InsertBB = StartInsertPt->getParent(); + if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) { if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) { if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) { @@ -482,9 +483,21 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { StartInsertPt); LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef); } else { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop " - << "start instruction.\n"); - return false; + // If we fail to move an instruction and the element count is provided + // by a mov, use the mov operand if it will have the same value at the + // insertion point + MachineOperand Operand = ElemDef->getOperand(1); + if (isMovRegOpcode(ElemDef->getOpcode()) && + RDA.getUniqueReachingMIDef(ElemDef, Operand.getReg()) == + RDA.getUniqueReachingMIDef(StartInsertPt, Operand.getReg())) { + TPNumElements = Operand; + NumElements = TPNumElements.getReg(); + } else { + LLVM_DEBUG(dbgs() + << "ARM Loops: Unable to move element count to loop " + << "start instruction.\n"); + return false; + } } } } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir new file mode 100644 index 00000000000000..9a5856335dfc67 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -0,0 +1,269 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -run-pass=arm-low-overhead-loops -tail-predication=enabled %s -o - | FileCheck %s + +--- | + define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) #0 { + entry: + %0 = add i32 %blockSize, 3 + %1 = icmp slt i32 %blockSize, 4 + %smin = select i1 %1, i32 %blockSize, i32 4 + %2 = sub i32 %0, %smin + %3 = lshr i32 %2, 2 + %4 = add nuw nsw i32 %3, 1 + %5 = icmp slt i32 %blockSize, 4 + %smin3 = select i1 %5, i32 %blockSize, i32 4 + %6 = sub i32 %0, %smin3 + %7 = lshr i32 %6, 2 + %8 = add nuw nsw i32 %7, 1 + call void @llvm.set.loop.iterations.i32(i32 %8) + br label %do.body.i + + do.body.i: ; preds = %do.body.i, %entry + %blkCnt.0.i = phi i32 [ %13, %do.body.i ], [ %blockSize, %entry ] + %sumVec.0.i = phi <4 x float> [ %12, %do.body.i ], [ zeroinitializer, %entry ] + %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ] + %9 = phi i32 [ %8, %entry ], [ %14, %do.body.i ] + %pSrc.addr.0.i2 = bitcast float* %pSrc.addr.0.i to <4 x float>* + %10 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i) + %11 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.0.i2, i32 4, <4 x i1> %10, <4 x float> zeroinitializer) + %12 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %11, <4 x i1> %10, <4 x float> %sumVec.0.i) + %add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4 + %13 = add i32 %blkCnt.0.i, -4 + %14 = call i32 @llvm.loop.decrement.reg.i32(i32 %9, i32 1) + %15 = icmp ne i32 %14, 0 + br i1 %15, label %do.body.i, label %arm_mean_f32_mve.exit + + arm_mean_f32_mve.exit: ; preds = %do.body.i + %16 = extractelement <4 x float> %12, i32 3 + %add2.i.i = fadd fast float %16, %16 + %conv.i = uitofp i32 %blockSize to float + %div.i = fdiv fast float %add2.i.i, %conv.i + %17 = bitcast float %div.i to i32 + %18 = insertelement <4 x i32> undef, i32 %17, i64 0 + %19 = shufflevector <4 x 
i32> %18, <4 x i32> undef, <4 x i32> zeroinitializer + %20 = bitcast <4 x i32> %19 to <4 x float> + call void @llvm.set.loop.iterations.i32(i32 %4) + br label %do.body + + do.body: ; preds = %do.body, %arm_mean_f32_mve.exit + %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %26, %do.body ] + %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %25, %do.body ] + %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ] + %21 = phi i32 [ %4, %arm_mean_f32_mve.exit ], [ %27, %do.body ] + %pSrc.addr.01 = bitcast float* %pSrc.addr.0 to <4 x float>* + %22 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) + %23 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %pSrc.addr.01, i32 4, <4 x i1> %22, <4 x float> zeroinitializer) + %24 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %23, <4 x float> %20, <4 x i1> %22, <4 x float> undef) + %25 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %24, <4 x float> %24, <4 x float> %sumVec.0, <4 x i1> %22) + %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4 + %26 = add i32 %blkCnt.0, -4 + %27 = call i32 @llvm.loop.decrement.reg.i32(i32 %21, i32 1) + %28 = icmp ne i32 %27, 0 + br i1 %28, label %do.body, label %do.end + + do.end: ; preds = %do.body + %29 = extractelement <4 x float> %25, i32 3 + %add2.i = fadd fast float %29, %29 + %sub2 = add i32 %blockSize, -1 + %conv = uitofp i32 %sub2 to float + %div = fdiv fast float %add2.i, %conv + store float %div, float* %pResult, align 4 + ret void + } + + ; Function Attrs: nounwind readnone + declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 + + ; Function Attrs: nounwind readnone + declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1 + + ; Function Attrs: nounwind readnone + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 + + ; Function Attrs: argmemonly nounwind readonly willreturn + declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2 + + ; Function Attrs: nounwind readnone + declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #1 + + ; Function Attrs: noduplicate nounwind + declare void @llvm.set.loop.iterations.i32(i32) #3 + + ; Function Attrs: noduplicate nounwind + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #3 + + attributes #0 = { "target-features"="+mve.fp" } + attributes #1 = { nounwind readnone "target-features"="+mve.fp" } + attributes #2 = { argmemonly nounwind readonly willreturn "target-features"="+mve.fp" } + attributes #3 = { noduplicate nounwind } + +... 
+--- +name: arm_var_f32_mve +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: arm_var_f32_mve + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: bb.1.do.body.i: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r12 + ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.arm_mean_f32_mve.exit: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $q0, $r0, $r1, $r2 + ; CHECK: $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 $r1 + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 + ; CHECK: renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 + ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: bb.3.do.body: + ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) + ; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3 + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable 
$r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.01, align 4) + ; CHECK: renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, killed renamable $q2, 0, killed $noreg + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.3 + ; CHECK: bb.4.do.end: + ; CHECK: liveins: $q0, $r1, $r2 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, killed renamable $s3, 14 /* CC::al */, $noreg, implicit killed $q0 + ; CHECK: $s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg + ; CHECK: renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg + ; CHECK: renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg + ; CHECK: VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult) + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + tCMPi8 renamable $r1, 4, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 10, 8, implicit-def $itstate + renamable $r3 = tMOVi8 $noreg, 4, 10 /* CC::ge */, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate + renamable $r12 = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg + $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + $r12 = tMOVr $r0, 14 /* CC::al */, $noreg + t2DoLoopStart renamable $lr + $r4 = tMOVr $lr, 14 /* CC::al */, $noreg + + bb.1.do.body.i: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + MVE_VPST 4, implicit $vpr + renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4) + renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, renamable $q0 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.arm_mean_f32_mve.exit: + successors: %bb.3(0x80000000) + liveins: $q0, $r0, $r1, $r2, $r4 + + $s4 = VMOVSR $r1, 14 /* CC::al */, $noreg + $lr = tMOVr $r4, 14 /* CC::al */, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, 
implicit $q0 + t2DoLoopStart killed $r4 + renamable $s4 = VUITOS killed renamable $s4, 14 /* CC::al */, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s4, 14 /* CC::al */, $noreg + renamable $r3 = VMOVRS killed renamable $s0, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $q1 = MVE_VDUP32 killed renamable $r3, 0, $noreg, undef renamable $q1 + $r3 = tMOVr $r1, 14 /* CC::al */, $noreg + + bb.3.do.body: + successors: %bb.3(0x7c000000), %bb.4(0x04000000) + liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + MVE_VPST 2, implicit $vpr + renamable $r0, renamable $q2 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.01, align 4) + renamable $q2 = nnan ninf nsz arcp contract afn reassoc MVE_VSUBf32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2 + renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VFMAf32 killed renamable $q0, killed renamable $q2, renamable $q2, 1, killed renamable $vpr + t2LoopEnd renamable $lr, %bb.3, implicit-def dead $cpsr + tB %bb.4, 14 /* CC::al */, $noreg + + bb.4.do.end: + liveins: $q0, $r1, $r2 + + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 1, 14 /* CC::al */, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VADDS killed renamable $s3, renamable $s3, 14 /* CC::al */, $noreg, implicit $q0 + $s2 = VMOVSR killed $r0, 14 /* CC::al */, $noreg + renamable $s2 = VUITOS killed renamable $s2, 14 /* CC::al */, $noreg + renamable $s0 = nnan ninf nsz arcp contract afn reassoc VDIVS killed renamable $s0, killed renamable $s2, 14 /* CC::al */, $noreg + VSTRS killed renamable $s0, killed renamable $r2, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.pResult) + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll new file mode 100644 index 00000000000000..63a2c0233f6e3e --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s +define arm_aapcs_vfpcc void @arm_var_f32_mve(float* %pSrc, i32 %blockSize, float* nocapture %pResult) { +; CHECK-LABEL: .LBB0_1: @ %do.body.i +; CHECK: dlstp.32 lr, r1 +; CHECK-NEXT: vadd.f32 s0, s3, s3 +; CHECK-NEXT: vcvt.f32.u32 s4, s4 +; CHECK-NEXT: vdiv.f32 s0, s0, s4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vdup.32 q1, r3 +; CHECK-NEXT: mov r3, r1 +; CHECK-NEXT: .LBB0_3: @ %do.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: vsub.f32 q2, q2, q1 +; CHECK-NEXT: vfma.f32 q0, q2, q2 +; CHECK-NEXT: letp lr, .LBB0_3 +entry: + br label %do.body.i + +do.body.i: ; preds = %entry, %do.body.i + %blkCnt.0.i = phi i32 [ %sub.i, %do.body.i ], [ %blockSize, %entry ] + %sumVec.0.i = phi <4 x float> [ %3, %do.body.i ], [ zeroinitializer, %entry ] + %pSrc.addr.0.i = phi float* [ %add.ptr.i, %do.body.i ], [ %pSrc, %entry ] + %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0.i) + %1 = bitcast float* %pSrc.addr.0.i to <4 x float>* + %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer) + %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %sumVec.0.i, <4 x float> %2, <4 x i1> %0, <4 x float> %sumVec.0.i) + %sub.i = add nsw i32 %blkCnt.0.i, -4 + %add.ptr.i = getelementptr inbounds float, float* %pSrc.addr.0.i, i32 4 + %cmp.i = icmp sgt i32 %blkCnt.0.i, 4 + br i1 %cmp.i, label %do.body.i, label %arm_mean_f32_mve.exit + +arm_mean_f32_mve.exit: ; preds = %do.body.i + %4 = extractelement <4 x float> %3, i32 3 + %add2.i.i = fadd fast float %4, %4 + %conv.i = uitofp i32 %blockSize to float + %div.i = fdiv fast float %add2.i.i, %conv.i + %.splatinsert = insertelement <4 x float> undef, float %div.i, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %do.body + +do.body: ; preds = %do.body, %arm_mean_f32_mve.exit + %blkCnt.0 = phi i32 [ %blockSize, %arm_mean_f32_mve.exit ], [ %sub, %do.body ] + %sumVec.0 = phi <4 x float> [ zeroinitializer, %arm_mean_f32_mve.exit ], [ %9, %do.body ] + %pSrc.addr.0 = phi float* [ %pSrc, %arm_mean_f32_mve.exit ], [ %add.ptr, %do.body ] + %5 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.0) + %6 = bitcast float* %pSrc.addr.0 to <4 x float>* + %7 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %6, i32 4, <4 x i1> %5, <4 x float> zeroinitializer) + %8 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %7, <4 x float> %.splat, <4 x i1> %5, <4 x float> undef) + %9 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %8, <4 x float> %8, <4 x float> %sumVec.0, <4 x i1> %5) + %sub = add nsw i32 %blkCnt.0, -4 + %add.ptr = getelementptr inbounds float, float* %pSrc.addr.0, i32 4 + %cmp1 = icmp sgt i32 %blkCnt.0, 4 + br i1 %cmp1, label %do.body, label %do.end + +do.end: ; preds = %do.body + %10 = extractelement <4 x 
float> %9, i32 3
+  %add2.i = fadd fast float %10, %10
+  %sub2 = add i32 %blockSize, -1
+  %conv = uitofp i32 %sub2 to float
+  %div = fdiv fast float %add2.i, %conv
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %do.end
+  store float %div, float* %pResult, align 4
+  ret void
+}
+
+declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
+
+declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
+
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)
+
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
+
+declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)

From 7baed769c7ea8de27a1c077c7ff30f4e19988ade Mon Sep 17 00:00:00 2001
From: Jan Kratochvil 
Date: Tue, 18 Aug 2020 18:09:55 +0200
Subject: [PATCH 058/101] [lldb] [testsuite] Add split-file for check-lldb
 dependencies

D85968 started to use `split-file`, and while buildbots run fine, when
doing `make check-lldb` by hand I get:

  .../llvm-monorepo-clangassert/tools/lldb/test/SymbolFile/DWARF/Output/DW_AT_declaration-with-children.s.script: line 2: split-file: command not found
  failed: lldb-shell :: SymbolFile/DWARF/DW_AT_declaration-with-children.s

Differential Revision: https://reviews.llvm.org/D86144

---
 lldb/test/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt
index c0249180253abe..21d8c61f11ed58 100644
--- a/lldb/test/CMakeLists.txt
+++ b/lldb/test/CMakeLists.txt
@@ -77,6 +77,7 @@ if(NOT LLDB_BUILT_STANDALONE)
     dsymutil
     llvm-strip
     not
+    split-file
    yaml2obj
    )
 endif()

From 3471520b1f6bc4fedfe45505f02924dc44e5106f Mon Sep 17 00:00:00 2001
From: David Green 
Date: Tue, 18 Aug 2020 17:15:45 +0100
Subject: [PATCH 059/101] [ARM] Allow tail predication of VLDn

VLD2/4 instructions cannot be predicated, so we cannot tail predicate
them from autovec. From intrinsics though, they should be valid as they
will just end up loading extra values into off vector lanes, not
affecting the on lanes. The same is true for loads in general where, so
long as we are not using the other vector lanes, an unpredicated load
can be converted to a predicated one.

This marks VLD2 and VLD4 instructions as validForTailPredication and
allows any unpredicated load in a tail predicated loop, which seems to
be valid given the other checks we have.
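
For illustration, this is the kind of source pattern it enables (a
sketch in MVE intrinsics, not one of the tests below; the function name
and shapes are made up). The vld2q has no predicated form, but the tail
lanes it over-reads are never used, because the store is predicated on
the vctp:

  #include <arm_mve.h>

  // Sketch: pairwise add of interleaved data, n not a multiple of 8.
  // The vld2q is unpredicated; lanes past n are loaded but never stored,
  // since vst1q_p is predicated on vctp16q(n - i).
  void pairwise_add(const int16_t *src, int16_t *dst, int n) {
    for (int i = 0; i < n; i += 8) {
      mve_pred16_t p = vctp16q((uint32_t)(n - i)); // tail predicate
      int16x8x2_t v = vld2q_s16(src + 2 * i);      // unpredicated VLD2
      vst1q_p_s16(dst + i, vaddq_s16(v.val[0], v.val[1]), p);
    }
  }
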
Differential Revision: https://reviews.llvm.org/D86022 --- llvm/lib/Target/ARM/ARMInstrMVE.td | 1 + llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 6 +- .../Thumb2/LowOverheadLoops/unpredload.ll | 40 +++--------- .../unittests/Target/ARM/MachineInstrTest.cpp | 62 +++++++++++++++---- 4 files changed, 64 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index c4ce13677b309f..eda41e8eef065d 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -5829,6 +5829,7 @@ class MVE_vldst24_base stage, bits<2> size, let mayLoad = load; let mayStore = !eq(load,0); let hasSideEffects = 0; + let validForTailPredication = load; } // A parameter class used to encapsulate all the ways the writeback diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 2e7cd412db1cc0..11e8aa742d89b3 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -782,7 +782,7 @@ bool LowOverheadLoop::ValidateLiveOuts() { // the false lanes are zeroed and here we're trying to track that those false // lanes remain zero, or where they change, the differences are masked away // by their user(s). - // All MVE loads and stores have to be predicated, so we know that any load + // All MVE stores have to be predicated, so we know that any predicate load // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. Because of this, we can insert @@ -1038,8 +1038,8 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) { } // If the instruction is already explicitly predicated, then the conversion - // will be fine, but ensure that all memory operations are predicated. - return !IsUse && MI->mayLoadOrStore() ? false : true; + // will be fine, but ensure that all store operations are predicated. + return !IsUse && MI->mayStore() ? false : true; } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll index 3f48cc3ad59b2e..440080e4e142dc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unpredload.ll @@ -6,26 +6,17 @@ define void @arm_cmplx_mag_squared_q15_mve(i16* %pSrc, i16* %pDst, i32 %blockSiz ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: subs.w r12, r2, #8 -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: csinv r3, r3, r12, pl -; CHECK-NEXT: add.w r12, r3, r2 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #3 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.16 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vld20.16 {q0, q1}, [r0] -; CHECK-NEXT: vctp.16 r2 -; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: vld21.16 {q0, q1}, [r0]! 
-; CHECK-NEXT: vpstttt -; CHECK-NEXT: vmulht.s16 q2, q1, q1 -; CHECK-NEXT: vmulht.s16 q0, q0, q0 -; CHECK-NEXT: vqaddt.s16 q0, q0, q2 -; CHECK-NEXT: vshrt.s16 q0, q0, #1 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrht.16 q0, [r1], #16 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vmulh.s16 q2, q1, q1 +; CHECK-NEXT: vmulh.s16 q0, q0, q0 +; CHECK-NEXT: vqadd.s16 q0, q0, q2 +; CHECK-NEXT: vshr.s16 q0, q0, #1 +; CHECK-NEXT: vstrh.16 q0, [r1], #16 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: pop {r7, pc} entry: @@ -148,25 +139,14 @@ define i32 @good2(i32* nocapture readonly %x, i32* nocapture readonly %y, i32 %n ; CHECK-LABEL: good2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: cmp r2, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #4 -; CHECK-NEXT: subs r3, r2, r3 -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vmlavat.s32 r12, q1, q0 -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: vmlava.s32 r12, q1, q0 +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp index 792a15dcbfafd3..876e011e1ce8a6 100644 --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -382,7 +382,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { return false; case MVE_ASRLi: case MVE_ASRLr: - case MVE_LSRL: + case MVE_LSRL: case MVE_SQRSHR: case MVE_SQSHL: case MVE_SRSHR: @@ -393,7 +393,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VABDf32: case MVE_VABDs16: case MVE_VABDs32: - case MVE_VABDs8: + case MVE_VABDs8: case MVE_VABDu16: case MVE_VABDu32: case MVE_VABDu8: @@ -609,6 +609,42 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VIWDUPu16: case MVE_VIWDUPu32: case MVE_VIWDUPu8: + case MVE_VLD20_8: + case MVE_VLD21_8: + case MVE_VLD20_16: + case MVE_VLD21_16: + case MVE_VLD20_32: + case MVE_VLD21_32: + case MVE_VLD20_8_wb: + case MVE_VLD21_8_wb: + case MVE_VLD20_16_wb: + case MVE_VLD21_16_wb: + case MVE_VLD20_32_wb: + case MVE_VLD21_32_wb: + case MVE_VLD40_8: + case MVE_VLD41_8: + case MVE_VLD42_8: + case MVE_VLD43_8: + case MVE_VLD40_16: + case MVE_VLD41_16: + case MVE_VLD42_16: + case MVE_VLD43_16: + case MVE_VLD40_32: + case MVE_VLD41_32: + case MVE_VLD42_32: + case MVE_VLD43_32: + case MVE_VLD40_8_wb: + case MVE_VLD41_8_wb: + case MVE_VLD42_8_wb: + case MVE_VLD43_8_wb: + case MVE_VLD40_16_wb: + case MVE_VLD41_16_wb: + case MVE_VLD42_16_wb: + case MVE_VLD43_16_wb: + case MVE_VLD40_32_wb: + case MVE_VLD41_32_wb: + case MVE_VLD42_32_wb: + case MVE_VLD43_32_wb: case MVE_VLDRBS16: case MVE_VLDRBS16_post: case MVE_VLDRBS16_pre: @@ -657,9 +693,9 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VLDRWU32_rq_u: case MVE_VMOVimmf32: case MVE_VMOVimmi16: - case MVE_VMOVimmi32: + case MVE_VMOVimmi32: case MVE_VMOVimmi64: - case MVE_VMOVimmi8: + case MVE_VMOVimmi8: case MVE_VMOVNi16bh: case MVE_VMOVNi16th: case MVE_VMOVNi32bh: @@ -679,7 +715,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { 
case MVE_VMULLTs8: case MVE_VMULLTu16: case MVE_VMULLTu32: - case MVE_VMULLTu8: + case MVE_VMULLTu8: case MVE_VMUL_qr_f16: case MVE_VMUL_qr_f32: case MVE_VMUL_qr_i16: @@ -702,7 +738,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VORR: case MVE_VORRimmi16: case MVE_VORRimmi32: - case MVE_VPST: + case MVE_VPST: case MVE_VQABSs16: case MVE_VQABSs32: case MVE_VQABSs8: @@ -814,7 +850,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VRHADDs32: case MVE_VRHADDs8: case MVE_VRHADDu16: - case MVE_VRHADDu32: + case MVE_VRHADDu32: case MVE_VRHADDu8: case MVE_VRINTf16A: case MVE_VRINTf16M: @@ -825,12 +861,12 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VRINTf32A: case MVE_VRINTf32M: case MVE_VRINTf32N: - case MVE_VRINTf32P: - case MVE_VRINTf32X: + case MVE_VRINTf32P: + case MVE_VRINTf32X: case MVE_VRINTf32Z: case MVE_VRSHL_by_vecs16: case MVE_VRSHL_by_vecs32: - case MVE_VRSHL_by_vecs8: + case MVE_VRSHL_by_vecs8: case MVE_VRSHL_by_vecu16: case MVE_VRSHL_by_vecu32: case MVE_VRSHL_by_vecu8: @@ -887,7 +923,7 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { case MVE_VSTRB16_rq: case MVE_VSTRB32: case MVE_VSTRB32_post: - case MVE_VSTRB32_pre: + case MVE_VSTRB32_pre: case MVE_VSTRB32_rq: case MVE_VSTRB8_rq: case MVE_VSTRBU8: @@ -957,7 +993,9 @@ TEST(MachineInstrValidTailPredication, IsCorrect) { for (auto &Op : Desc.operands()) { // Only check instructions that access the MQPR regs. if ((Op.OperandType & MCOI::OPERAND_REGISTER) == 0 || - Op.RegClass != ARM::MQPRRegClassID) + (Op.RegClass != ARM::MQPRRegClassID && + Op.RegClass != ARM::QQPRRegClassID && + Op.RegClass != ARM::QQQQPRRegClassID)) continue; uint64_t Flags = MII->get(i).TSFlags; From ca77ab494aa29f7521ff797d230cd1b36cbe4e62 Mon Sep 17 00:00:00 2001 From: "Mott, Jeffrey T" Date: Fri, 17 Jul 2020 09:50:08 -0700 Subject: [PATCH 060/101] Disable use of _ExtInt with '__atomic' builtins We're (temporarily) disabling ExtInt for the '__atomic' builtins so we can better design their behavior later. The idea is until we do an audit/design for the way atomic builtins are supposed to work with _ExtInt, we should leave them restricted so they don't limit our future options, such as by binding us to a sub-optimal implementation via ABI. Example after this change: $ cat test.c void f(_ExtInt(64) *ptr) { __atomic_fetch_add(ptr, 1, 0); } $ clang -c test.c test.c:2:22: error: argument to atomic builtin of type '_ExtInt' is not supported __atomic_fetch_add(ptr, 1, 0); ^ 1 error generated. 
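
For reference, the power-of-two size check on the older `__sync`
builtins is unchanged; mirroring the Sema tests below, a case like this
still compiles:

  $ cat test2.c
  void g(_ExtInt(64) *ptr, int value) {
    __sync_fetch_and_add(ptr, value); // accepted: power-of-two size
  }
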
Differential Revision: https://reviews.llvm.org/D84049 --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 7 ++++--- clang/lib/Sema/SemaChecking.cpp | 5 +++++ clang/lib/Sema/SemaType.cpp | 5 +---- clang/test/Sema/builtins.c | 4 ++++ clang/test/SemaCXX/ext-int.cpp | 5 +++-- libcxx/test/libcxx/atomics/ext-int.verify.cpp | 11 +++++++++++ 6 files changed, 28 insertions(+), 9 deletions(-) create mode 100644 libcxx/test/libcxx/atomics/ext-int.verify.cpp diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index fd21285b1f7929..a63fae5b5f726c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -6038,9 +6038,8 @@ def err_func_def_incomplete_result : Error< def err_atomic_specifier_bad_type : Error<"_Atomic cannot be applied to " "%select{incomplete |array |function |reference |atomic |qualified " - "|sizeless ||integer |integer }0type " - "%1 %select{|||||||which is not trivially copyable|with less than " - "1 byte of precision|with a non power of 2 precision}0">; + "|sizeless ||integer }0type " + "%1 %select{|||||||which is not trivially copyable|}0">; // Expressions. def ext_sizeof_alignof_function_type : Extension< @@ -7967,6 +7966,8 @@ def err_atomic_exclusive_builtin_pointer_size : Error< " 1,2,4 or 8 byte type (%0 invalid)">; def err_atomic_builtin_ext_int_size : Error< "Atomic memory operand must have a power-of-two size">; +def err_atomic_builtin_ext_int_prohibit : Error< + "argument to atomic builtin of type '_ExtInt' is not supported">; def err_atomic_op_needs_atomic : Error< "address argument to atomic operation must be a pointer to _Atomic " "type (%0 invalid)">; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 4efd62f58d2e6d..70d3a682fc7028 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5050,6 +5050,11 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, ? 
0 : 1); + if (ValType->isExtIntType()) { + Diag(Ptr->getExprLoc(), diag::err_atomic_builtin_ext_int_prohibit); + return ExprError(); + } + return AE; } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index b2be31ac09904e..4ab5cc5fd8b989 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8963,11 +8963,8 @@ QualType Sema::BuildAtomicType(QualType T, SourceLocation Loc) { else if (!T.isTriviallyCopyableType(Context)) // Some other non-trivially-copyable type (probably a C++ class) DisallowedKind = 7; - else if (auto *ExtTy = T->getAs()) { - if (ExtTy->getNumBits() < 8) + else if (T->isExtIntType()) { DisallowedKind = 8; - else if (!llvm::isPowerOf2_32(ExtTy->getNumBits())) - DisallowedKind = 9; } if (DisallowedKind != -1) { diff --git a/clang/test/Sema/builtins.c b/clang/test/Sema/builtins.c index 4b445724f712a1..e4093edb5f0063 100644 --- a/clang/test/Sema/builtins.c +++ b/clang/test/Sema/builtins.c @@ -285,12 +285,16 @@ void test_ei_i42i(_ExtInt(42) *ptr, int value) { __sync_fetch_and_add(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}} // expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}} __sync_nand_and_fetch(ptr, value); // expected-error {{Atomic memory operand must have a power-of-two size}} + + __atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_ExtInt' is not supported}} } void test_ei_i64i(_ExtInt(64) *ptr, int value) { __sync_fetch_and_add(ptr, value); // expect success // expected-warning@+1 {{the semantics of this intrinsic changed with GCC version 4.4 - the newer semantics are provided here}} __sync_nand_and_fetch(ptr, value); // expect success + + __atomic_fetch_add(ptr, 1, 0); // expected-error {{argument to atomic builtin of type '_ExtInt' is not supported}} } void test_ei_ii42(int *ptr, _ExtInt(42) value) { diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp index 0f2a3b89be1f1c..a619cd2eb5de1a 100644 --- a/clang/test/SemaCXX/ext-int.cpp +++ b/clang/test/SemaCXX/ext-int.cpp @@ -91,10 +91,11 @@ typedef _ExtInt(32) __attribute__((vector_size(16))) VecTy; _Complex _ExtInt(3) Cmplx; // Reject cases of _Atomic: -// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(4)' with less than 1 byte of precision}} +// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(4)'}} _Atomic _ExtInt(4) TooSmallAtomic; -// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(9)' with a non power of 2 precision}} +// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(9)'}} _Atomic _ExtInt(9) NotPow2Atomic; +// expected-error@+1{{_Atomic cannot be applied to integer type '_ExtInt(128)'}} _Atomic _ExtInt(128) JustRightAtomic; // Test result types of Unary/Bitwise/Binary Operations: diff --git a/libcxx/test/libcxx/atomics/ext-int.verify.cpp b/libcxx/test/libcxx/atomics/ext-int.verify.cpp new file mode 100644 index 00000000000000..3f57437f43cc6f --- /dev/null +++ b/libcxx/test/libcxx/atomics/ext-int.verify.cpp @@ -0,0 +1,11 @@ +// REQUIRES: clang-11 + +#include + +int main(int, char**) +{ + // expected-error@atomic:*1 {{_Atomic cannot be applied to integer type '_ExtInt(32)'}} + std::atomic<_ExtInt(32)> x {42}; + + return 0; +} From c466c5fa7ee90f90c0e1d08777f1f085bb78a475 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 18 Aug 2020 09:20:05 -0700 Subject: [PATCH 061/101] [ARM] Fix build after D86087 --- 
 llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 11e8aa742d89b3..4d1ab88fe3b2c8 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1342,7 +1342,7 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
   MachineBasicBlock *MBB = InsertPt->getParent();
   bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart;
   unsigned Opc = LoLoop.getStartOpcode();
-  MachineOperand &Count = LoLoop.getCount();
+  MachineOperand &Count = LoLoop.getLoopStartOperand();
 
   MachineInstrBuilder MIB =
       BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc));

From 8f4859d35120b007c53ac075375d9d1791ec6c86 Mon Sep 17 00:00:00 2001
From: Mauricio Sifontes 
Date: Tue, 18 Aug 2020 16:47:06 +0000
Subject: [PATCH 062/101] Create Optimization Pass Wrapper for MLIR Reduce

Create a reduction pass that accepts an optimization pass as argument
and only replaces the golden module in the pipeline if the output of the
optimization pass is smaller than the input and still exhibits the
interesting behavior.

Add a -pass-test option to test individual passes in the MLIR Reduce
tool.

Reviewed By: jpienaar

Differential Revision: https://reviews.llvm.org/D84783

---
 mlir/include/mlir/Reducer/OptReductionPass.h  | 52 ++++++++++++++++++
 mlir/include/mlir/Reducer/Passes.td           |  7 ++-
 mlir/include/mlir/Reducer/ReductionTreePass.h |  2 +-
 mlir/test/mlir-reduce/dce-test.mlir           | 17 ++++++
 ...-tree-pass.mlir => multiple-function.mlir} |  2 +-
 .../{testcase-linux.mlir => simple-test.mlir} |  4 +-
 ...reducer-pass.mlir => single-function.mlir} |  2 +-
 mlir/tools/mlir-reduce/CMakeLists.txt         |  1 +
 mlir/tools/mlir-reduce/OptReductionPass.cpp   | 55 +++++++++++++++++++
 mlir/tools/mlir-reduce/mlir-reduce.cpp        | 23 ++++++--
 10 files changed, 154 insertions(+), 11 deletions(-)
 create mode 100644 mlir/include/mlir/Reducer/OptReductionPass.h
 create mode 100644 mlir/test/mlir-reduce/dce-test.mlir
 rename mlir/test/mlir-reduce/{reduction-tree-pass.mlir => multiple-function.mlir} (90%)
 rename mlir/test/mlir-reduce/{testcase-linux.mlir => simple-test.mlir} (81%)
 rename mlir/test/mlir-reduce/{test-reducer-pass.mlir => single-function.mlir} (52%)
 create mode 100644 mlir/tools/mlir-reduce/OptReductionPass.cpp

diff --git a/mlir/include/mlir/Reducer/OptReductionPass.h b/mlir/include/mlir/Reducer/OptReductionPass.h
new file mode 100644
index 00000000000000..2168ea2159506b
--- /dev/null
+++ b/mlir/include/mlir/Reducer/OptReductionPass.h
@@ -0,0 +1,52 @@
+//===- OptReductionPass.h - Optimization Reduction Pass Wrapper -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Opt Reduction Pass Wrapper. It creates a pass to run
+// any optimization pass within it and only replaces the output module with the
+// transformed version if it is smaller and interesting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_REDUCER_OPTREDUCTIONPASS_H
+#define MLIR_REDUCER_OPTREDUCTIONPASS_H
+
+#include "PassDetail.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Reducer/ReductionNode.h"
+#include "mlir/Reducer/ReductionTreePass.h"
+#include "mlir/Reducer/Tester.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/Support/Debug.h"
+
+namespace mlir {
+
+class OptReductionPass : public OptReductionBase<OptReductionPass> {
+public:
+  OptReductionPass(const Tester *test, MLIRContext *context,
+                   std::unique_ptr<Pass> optPass);
+
+  OptReductionPass(const OptReductionPass &srcPass);
+
+  /// Runs the pass instance in the pass pipeline.
+  void runOnOperation() override;
+
+private:
+  // Points to the context to be used in the pass manager.
+  MLIRContext *context;
+
+  // This is used to test the interesting behavior of the transformed module.
+  const Tester *test;
+
+  // Points to the mlir-opt pass to be called.
+  std::unique_ptr<Pass> optPass;
+};
+
+} // end namespace mlir
+
+#endif
diff --git a/mlir/include/mlir/Reducer/Passes.td b/mlir/include/mlir/Reducer/Passes.td
index 4703dd746a7095..d3a934ef693345 100644
--- a/mlir/include/mlir/Reducer/Passes.td
+++ b/mlir/include/mlir/Reducer/Passes.td
@@ -17,7 +17,10 @@ include "mlir/Pass/PassBase.td"
 
 def ReductionTree : Pass<"reduction-tree", "ModuleOp"> {
   let summary = "A general reduction tree pass for the MLIR Reduce Tool";
-  let constructor = "mlir::createReductionTreePass()";
 }
 
-#endif // MLIR_REDUCE_PASSES
+def OptReduction : Pass<"opt-reduction-pass", "ModuleOp"> {
+  let summary = "A reduction pass wrapper for optimization passes";
+}
+
+#endif // MLIR_REDUCER_PASSES
diff --git a/mlir/include/mlir/Reducer/ReductionTreePass.h b/mlir/include/mlir/Reducer/ReductionTreePass.h
index 01104aa0429b02..d07a475e4f9948 100644
--- a/mlir/include/mlir/Reducer/ReductionTreePass.h
+++ b/mlir/include/mlir/Reducer/ReductionTreePass.h
@@ -34,7 +34,7 @@ enum TraversalMode { SinglePath, MultiPath, Concurrent, Backtrack };
 // class.
 class ReductionTreeUtils {
 public:
-  void updateGoldenModule(ModuleOp &golden, ModuleOp reduced);
+  static void updateGoldenModule(ModuleOp &golden, ModuleOp reduced);
 };
 
 /// This class defines the Reduction Tree Pass. It provides a framework to
diff --git a/mlir/test/mlir-reduce/dce-test.mlir b/mlir/test/mlir-reduce/dce-test.mlir
new file mode 100644
index 00000000000000..e368343e056a0a
--- /dev/null
+++ b/mlir/test/mlir-reduce/dce-test.mlir
@@ -0,0 +1,17 @@
+// UNSUPPORTED: -windows-
+// RUN: mlir-reduce %s -test %S/failure-test.sh -pass-test DCE | FileCheck %s
+// This input should be reduced by the pass pipeline so that only
+// the @simple1 function remains, as the other functions should be
+// removed by the dead code elimination pass.
+// CHECK-LABEL: func @simple1(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+
+// CHECK-NOT: func @dead_private_function
+func @dead_private_function() attributes { sym_visibility = "private" }
+
+// CHECK-NOT: func @dead_nested_function
+func @dead_nested_function() attributes { sym_visibility = "nested" }
+
+func @simple1(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
+  "test.crashOp" () : () -> ()
+  return
+}
diff --git a/mlir/test/mlir-reduce/reduction-tree-pass.mlir b/mlir/test/mlir-reduce/multiple-function.mlir
similarity index 90%
rename from mlir/test/mlir-reduce/reduction-tree-pass.mlir
rename to mlir/test/mlir-reduce/multiple-function.mlir
index dc04a626d1915a..d225df8b8676d7 100644
--- a/mlir/test/mlir-reduce/reduction-tree-pass.mlir
+++ b/mlir/test/mlir-reduce/multiple-function.mlir
@@ -1,5 +1,5 @@
 // UNSUPPORTED: -windows-
-// RUN: mlir-reduce %s -test %S/failure-test.sh | FileCheck %s
+// RUN: mlir-reduce %s -test %S/failure-test.sh -pass-test function-reducer | FileCheck %s
 // This input should be reduced by the pass pipeline so that only
 // the @simple5 function remains as this is the shortest function
 // containing the interesting behavior.
diff --git a/mlir/test/mlir-reduce/testcase-linux.mlir b/mlir/test/mlir-reduce/simple-test.mlir
similarity index 81%
rename from mlir/test/mlir-reduce/testcase-linux.mlir
rename to mlir/test/mlir-reduce/simple-test.mlir
index f2bb161bb5a690..5329e9552f5bb2 100644
--- a/mlir/test/mlir-reduce/testcase-linux.mlir
+++ b/mlir/test/mlir-reduce/simple-test.mlir
@@ -1,5 +1,5 @@
 // UNSUPPORTED: -windows-
-// RUN: mlir-reduce %s -test %S/test.sh
+// RUN: mlir-reduce %s -test %S/test.sh -pass-test function
 
 func @simple1(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   cond_br %arg0, ^bb1, ^bb2
@@ -10,4 +10,4 @@ func @simple1(%arg0: i1, %arg1: memref<2xf32>, %arg2: memref<2xf32>) {
   br ^bb3(%0 : memref<2xf32>)
 ^bb3(%1: memref<2xf32>):
   return
-} \ No newline at end of file
+}
diff --git a/mlir/test/mlir-reduce/test-reducer-pass.mlir b/mlir/test/mlir-reduce/single-function.mlir
similarity index 52%
rename from mlir/test/mlir-reduce/test-reducer-pass.mlir
rename to mlir/test/mlir-reduce/single-function.mlir
index da5b0c96335530..732963553e9002 100644
--- a/mlir/test/mlir-reduce/test-reducer-pass.mlir
+++ b/mlir/test/mlir-reduce/single-function.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s
-// RUN: not mlir-opt %s -test-mlir-reducer
+// RUN: not mlir-opt %s -test-mlir-reducer -pass-test function-reducer
 
 func @test() {
   "test.crashOp"() : () -> ()
diff --git a/mlir/tools/mlir-reduce/CMakeLists.txt b/mlir/tools/mlir-reduce/CMakeLists.txt
index b3a7c36a030928..f581eee21fab62 100644
--- a/mlir/tools/mlir-reduce/CMakeLists.txt
+++ b/mlir/tools/mlir-reduce/CMakeLists.txt
@@ -32,6 +32,7 @@ set(LIBS
   )
 
 add_llvm_tool(mlir-reduce
+  OptReductionPass.cpp
  Passes/FunctionReducer.cpp
  ReductionNode.cpp
  ReductionTreePass.cpp
diff --git a/mlir/tools/mlir-reduce/OptReductionPass.cpp b/mlir/tools/mlir-reduce/OptReductionPass.cpp
new file mode 100644
index 00000000000000..dbb3d97046d476
--- /dev/null
+++ b/mlir/tools/mlir-reduce/OptReductionPass.cpp
@@ -0,0 +1,55 @@
+//===- OptReductionPass.cpp - Optimization Reduction Pass Wrapper ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Opt Reduction Pass class. It creates a pass to run
+// any optimization pass within it and only replaces the output module with the
+// transformed version if it is smaller and interesting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Reducer/OptReductionPass.h"
+
+#define DEBUG_TYPE "mlir-reduce"
+
+using namespace mlir;
+
+OptReductionPass::OptReductionPass(const Tester *test, MLIRContext *context,
+                                   std::unique_ptr<Pass> optPass)
+    : context(context), test(test), optPass(std::move(optPass)) {}
+
+OptReductionPass::OptReductionPass(const OptReductionPass &srcPass)
+    : test(srcPass.test), optPass(srcPass.optPass.get()) {}
+
+/// Runs the pass instance in the pass pipeline.
+void OptReductionPass::runOnOperation() {
+  LLVM_DEBUG(llvm::dbgs() << "\nOptimization Reduction pass: ");
+  LLVM_DEBUG(llvm::dbgs() << optPass.get()->getName() << "\nTesting:\n");
+
+  ModuleOp module = this->getOperation();
+  ModuleOp moduleVariant = module.clone();
+  PassManager pmTransform(context);
+  pmTransform.addPass(std::move(optPass));
+
+  if (failed(pmTransform.run(moduleVariant)))
+    return;
+
+  ReductionNode original(module, nullptr);
+  original.measureAndTest(test);
+
+  ReductionNode reduced(moduleVariant, nullptr);
+  reduced.measureAndTest(test);
+
+  if (reduced.isInteresting() && reduced.getSize() < original.getSize()) {
+    ReductionTreeUtils::updateGoldenModule(module, reduced.getModule().clone());
+    LLVM_DEBUG(llvm::dbgs() << "\nSuccessful Transformed version\n\n");
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "\nUnsuccessful Transformed version\n\n");
+  }
+
+  LLVM_DEBUG(llvm::dbgs() << "Pass Complete\n\n");
+}
diff --git a/mlir/tools/mlir-reduce/mlir-reduce.cpp b/mlir/tools/mlir-reduce/mlir-reduce.cpp
index 93de0703d8927e..4c69aa0ad2172b 100644
--- a/mlir/tools/mlir-reduce/mlir-reduce.cpp
+++ b/mlir/tools/mlir-reduce/mlir-reduce.cpp
@@ -19,6 +19,7 @@
 #include "mlir/Parser.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "mlir/Reducer/OptReductionPass.h"
 #include "mlir/Reducer/ReductionNode.h"
 #include "mlir/Reducer/ReductionTreePass.h"
 #include "mlir/Reducer/Tester.h"
@@ -46,6 +47,11 @@ static llvm::cl::opt<std::string>
     llvm::cl::desc("Output filename for the reduced test case"),
     llvm::cl::init("-"));
 
+// TODO: Use PassPipelineCLParser to define pass pipelines in the command line.
+static llvm::cl::opt<std::string>
+    passTestSpecifier("pass-test",
+                      llvm::cl::desc("Indicate a specific pass to be tested"));
+
 // Parse and verify the input MLIR file.
 static LogicalResult loadModule(MLIRContext &context, OwningModuleRef &module,
                                 StringRef inputFilename) {
@@ -94,10 +100,19 @@ int main(int argc, char **argv) {
 
   // Reduction pass pipeline.
   PassManager pm(&context);
-  // Reduction tree pass with OpReducer variant generation and single path
-  // traversal.
-  pm.addPass(
-      std::make_unique<ReductionTreePass<SinglePath>>(&test));
+  if (passTestSpecifier == "DCE") {
+
+    // Opt Reduction Pass with SymbolDCEPass as opt pass.
+    pm.addPass(std::make_unique<OptReductionPass>(&test, &context,
+                                                  createSymbolDCEPass()));
+
+  } else if (passTestSpecifier == "function-reducer") {
+
+    // Reduction tree pass with OpReducer variant generation and single path
+    // traversal.
+    pm.addPass(std::make_unique<ReductionTreePass<SinglePath>>(
+        &test));
+  }

   ModuleOp m = moduleRef.get().clone();

From 501a078cbb4a79170fccf1346d772dae3d318057 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Tue, 18 Aug 2020 09:49:05 -0700
Subject: [PATCH 063/101] Revert "[TSan][libdispatch] Add interceptors for dispatch_async_and_wait()"

This reverts commit d137db80297f286f3a19eacc63d4a980646da437.

Breaks builds on older SDKs.
---
 .../rtl/tsan_interceptors_libdispatch.cpp     |  5 ---
 .../test/tsan/libdispatch/async_and_wait.c    | 31 -------------------
 2 files changed, 36 deletions(-)
 delete mode 100644 compiler-rt/test/tsan/libdispatch/async_and_wait.c

diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp
index 292ea5fbb23931..5dacd3256abc9a 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_libdispatch.cpp
@@ -219,9 +219,6 @@ static void invoke_and_release_block(void *param) {
 DISPATCH_INTERCEPT(dispatch, false)
 DISPATCH_INTERCEPT(dispatch_barrier, true)

-DISPATCH_INTERCEPT_SYNC_F(dispatch_async_and_wait_f, false)
-DISPATCH_INTERCEPT_SYNC_B(dispatch_async_and_wait, false)
-
 DECLARE_REAL(void, dispatch_after_f, dispatch_time_t when,
              dispatch_queue_t queue, void *context, dispatch_function_t work)

@@ -749,8 +746,6 @@ void InitializeLibdispatchInterceptors() {
   INTERCEPT_FUNCTION(dispatch_barrier_async_f);
   INTERCEPT_FUNCTION(dispatch_barrier_sync);
   INTERCEPT_FUNCTION(dispatch_barrier_sync_f);
-  INTERCEPT_FUNCTION(dispatch_async_and_wait);
-  INTERCEPT_FUNCTION(dispatch_async_and_wait_f);
   INTERCEPT_FUNCTION(dispatch_after);
   INTERCEPT_FUNCTION(dispatch_after_f);
   INTERCEPT_FUNCTION(dispatch_once);
diff --git a/compiler-rt/test/tsan/libdispatch/async_and_wait.c b/compiler-rt/test/tsan/libdispatch/async_and_wait.c
deleted file mode 100644
index 5e63c118aef53d..00000000000000
--- a/compiler-rt/test/tsan/libdispatch/async_and_wait.c
+++ /dev/null
@@ -1,31 +0,0 @@
-// RUN: %clang_tsan %s -o %t
-// RUN: %run %t 2>&1 | FileCheck %s --implicit-check-not='ThreadSanitizer'
-
-#include "dispatch/dispatch.h"
-
-#include <stdio.h>
-
-long global;
-
-int main() {
-  dispatch_queue_t q = dispatch_queue_create("my.queue", DISPATCH_QUEUE_SERIAL);
-  dispatch_semaphore_t s = dispatch_semaphore_create(0);
-
-  // Force queue to context switch onto separate thread.
-  dispatch_async(q, ^{
-    dispatch_semaphore_wait(s, DISPATCH_TIME_FOREVER);
-  });
-  dispatch_semaphore_signal(s);
-
-  global++;
-  dispatch_async_and_wait(q, ^{
-    // The queue continues to execute on separate thread. This would cause a
-    // race if we had used `dispatch_async()` without the `_and_wait` part.
-    global++;
-  });
-  global++;
-
-  fprintf(stderr, "Done.\n");
-}
-
-// CHECK: Done.
From 11ff5176c4655526960dd01024f5d1f99499d4ad Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 18 Aug 2020 17:08:49 +0100
Subject: [PATCH 064/101] [X86][AVX] lowerShuffleWithVPMOV - add non-VLX support.

We can efficiently handle non-VLX cases now that we have the
getAVX512TruncNode helper.
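For illustration, here is the effect on one of the cases updated in
llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll below
(trunc_v8i32_to_v8i8_with_zext_return_v16i8). On an AVX512F target without
VLX, the v8i32 -> v8i8 truncation zero-extended into a v16i8 result
previously lowered as:

  vpmovdw %zmm0, %ymm0
  vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero

With this patch it uses the zmm truncating move directly, leaving the upper
elements zeroed:

  vmovdqa %ymm0, %ymm0
  vpmovdb %zmm0, %xmm0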
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 17 ++------
 llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 40 ++++++++-----------
 2 files changed, 19 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ec4d236dc3ea19..2b19254c4344b6 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -11361,14 +11361,8 @@ static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
 //   t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
 //   t18: v2i64 = bitcast t51
 //
-// Without avx512vl, this is lowered to:
-//
-// vpmovqd %zmm0, %ymm0
-// vpshufb {{.*#+}} xmm0 =
-// xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-//
-// But when avx512vl is available, one can just use a single vpmovdw
-// instruction.
+// One can just use a single vpmovdw instruction; without avx512vl we need to
+// use the zmm variant and extract the lower subvector, padding with zeroes.
 // TODO: Merge with lowerShuffleAsVTRUNC.
 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
@@ -11400,11 +11394,6 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   SDValue Src = V1.getOperand(0).getOperand(0);
   MVT SrcVT = Src.getSimpleValueType();

-  // The vptrunc** instructions truncating 128 bit and 256 bit vectors
-  // are only available with avx512vl.
-  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
-    return SDValue();
-
   // Down Convert Word to Byte is only available with avx512bw. The case with
   // 256-bit output doesn't contain a shuffle and is therefore not handled here.
   if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
@@ -11417,7 +11406,7 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
       !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
     return SDValue();

-  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
+  return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, true);
 }

 // Attempt to match binary shuffle patterns as a truncate.
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 1559fdbbe72c8c..3919f326d39a53 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -561,9 +561,8 @@ define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nou ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -575,9 +574,8 @@ define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nou ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -621,9 +619,8 @@ define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nou ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -635,9 +632,8 @@ define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nou ; ; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -816,9 +812,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -830,9 +825,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) no ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -881,9 +875,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no ; ; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX512F: # %bb.0: -; 
AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -895,9 +888,8 @@ define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) no ; ; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; From a1caa302970de86b15d360212b526be8f1d59641 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 14 Aug 2020 17:09:23 -0700 Subject: [PATCH 065/101] [gn build] Add support for expensive checks Reviewed By: hans, MaskRay Differential Revision: https://reviews.llvm.org/D86007 --- llvm/utils/gn/build/BUILD.gn | 4 ++++ llvm/utils/gn/build/buildflags.gni | 3 +++ llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 8 +++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/build/BUILD.gn b/llvm/utils/gn/build/BUILD.gn index e29cdb678a361f..3c0b905991b50f 100644 --- a/llvm/utils/gn/build/BUILD.gn +++ b/llvm/utils/gn/build/BUILD.gn @@ -34,6 +34,10 @@ config("compiler_defaults") { defines += [ "NDEBUG" ] } + if (llvm_enable_expensive_checks) { + defines += [ "EXPENSIVE_CHECKS" ] + } + asmflags = target_flags cflags = target_flags ldflags = target_flags + target_ldflags diff --git a/llvm/utils/gn/build/buildflags.gni b/llvm/utils/gn/build/buildflags.gni index 4dcdc962b7d116..eb8ac55e48e01b 100644 --- a/llvm/utils/gn/build/buildflags.gni +++ b/llvm/utils/gn/build/buildflags.gni @@ -10,4 +10,7 @@ declare_args() { # Whether to enable assertions. llvm_enable_assertions = true + + # Whether to enable expensive checks. 
+ llvm_enable_expensive_checks = false } diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn index 5f8058699d7293..32480e51a4c270 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn @@ -1,6 +1,7 @@ import("//compiler-rt/target.gni") import("//compiler-rt/test/test.gni") import("//llvm/triples.gni") +import("//llvm/utils/gn/build/buildflags.gni") import("//llvm/utils/gn/build/libs/zlib/enable.gni") import("//llvm/utils/gn/build/toolchain/compiler.gni") import("//llvm/utils/gn/build/write_cmake_config.gni") @@ -51,12 +52,17 @@ write_cmake_config("lit_common_configured") { "SANITIZER_CAN_USE_CXXABI_PYBOOL=True", "COMPILER_RT_HAS_LLD_PYBOOL=True", "COMPILER_RT_HAS_GWP_ASAN_PYBOOL=False", - "LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL=False", "HAVE_RPC_XDR_H=0", "ANDROID_NDK_VERSION=19", "ANDROID_SERIAL_FOR_TESTING=$android_serial_for_testing", ] + if (llvm_enable_expensive_checks) { + values += [ "LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL=True" ] + } else { + values += [ "LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL=False" ] + } + if (host_cpu == "x64") { values += [ "HOST_ARCH=x86_64" ] } else { From 55565752306e352e655bf8a4ba919c14d6b195c2 Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Thu, 13 Aug 2020 14:59:58 -0700 Subject: [PATCH 066/101] Added std.floor operation to match std.ceil There should be an equivalent std.floor op to std.ceil. This includes matching lowerings for SPIRV, NVVM, ROCDL, and LLVM. Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D85940 --- .../mlir/Dialect/StandardOps/IR/Ops.td | 33 +++++++++++++++++++ .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 5 ++- .../GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 5 ++- .../StandardToLLVM/StandardToLLVM.cpp | 2 ++ .../ConvertStandardToSPIRV.cpp | 1 + .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir | 15 +++++++++ .../Conversion/GPUToROCDL/gpu-to-rocdl.mlir | 15 +++++++++ .../StandardToLLVM/standard-to-llvm.mlir | 21 ++++++++++++ .../StandardToSPIRV/std-ops-to-spirv.mlir | 2 ++ mlir/test/IR/core-ops.mlir | 12 +++++++ 10 files changed, 109 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td index 088f262790d6c3..510d485d019f18 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td @@ -814,6 +814,39 @@ def CeilFOp : FloatUnaryOp<"ceilf"> { }]; } +//===----------------------------------------------------------------------===// +// FloorFOp +//===----------------------------------------------------------------------===// + +def FloorFOp : FloatUnaryOp<"floorf"> { + let summary = "floor of the specified value"; + let description = [{ + Syntax: + + ``` + operation ::= ssa-id `=` `std.floorf` ssa-use `:` type + ``` + + The `floorf` operation computes the floor of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. + + Example: + + ```mlir + // Scalar floor value. + %a = floorf %b : f64 + + // SIMD vector element-wise floor value. + %f = floorf %g : vector<4xf32> + + // Tensor element-wise floor value. 
+ %x = floorf %y : tensor<4x?xf8> + ``` + }]; +} + //===----------------------------------------------------------------------===// // CmpFOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 76c166842c2d95..d11cc51d1d594f 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -137,7 +137,8 @@ struct LowerGpuOpsToNVVMOpsPass LLVMConversionTarget target(getContext()); target.addIllegalDialect(); target.addIllegalOp(); + LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op, + LLVM::Log2Op>(); target.addIllegalOp(); target.addLegalDialect(); // TODO: Remove once we support replacing non-root ops. @@ -174,6 +175,8 @@ void mlir::populateGpuToNVVMConversionPatterns( "__nv_cos"); patterns.insert>(converter, "__nv_expf", "__nv_exp"); + patterns.insert>(converter, "__nv_floorf", + "__nv_floor"); patterns.insert>(converter, "__nv_logf", "__nv_log"); patterns.insert>(converter, "__nv_log10f", diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 697f8078e725da..40cf097c9c5a9e 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -71,7 +71,8 @@ struct LowerGpuOpsToROCDLOpsPass LLVMConversionTarget target(getContext()); target.addIllegalDialect(); target.addIllegalOp(); + LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op, + LLVM::Log2Op>(); target.addIllegalOp(); target.addLegalDialect(); // TODO: Remove once we support replacing non-root ops. @@ -104,6 +105,8 @@ void mlir::populateGpuToROCDLConversionPatterns( "__ocml_cos_f64"); patterns.insert>(converter, "__ocml_exp_f32", "__ocml_exp_f64"); + patterns.insert>(converter, "__ocml_floor_f32", + "__ocml_floor_f64"); patterns.insert>(converter, "__ocml_log_f32", "__ocml_log_f64"); patterns.insert>(converter, "__ocml_log10_f32", diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index 4a061963fce3aa..0ee1166b1a643b 100644 --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -1418,6 +1418,7 @@ using CosOpLowering = VectorConvertToLLVMPattern; using DivFOpLowering = VectorConvertToLLVMPattern; using ExpOpLowering = VectorConvertToLLVMPattern; using Exp2OpLowering = VectorConvertToLLVMPattern; +using FloorFOpLowering = VectorConvertToLLVMPattern; using Log10OpLowering = VectorConvertToLLVMPattern; using Log2OpLowering = VectorConvertToLLVMPattern; using LogOpLowering = VectorConvertToLLVMPattern; @@ -3285,6 +3286,7 @@ void mlir::populateStdToLLVMNonMemoryConversionPatterns( DivFOpLowering, ExpOpLowering, Exp2OpLowering, + FloorFOpLowering, GenericAtomicRMWOpLowering, LogOpLowering, Log10OpLowering, diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp index 268139faa2fdde..6ae17c33070cfa 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp @@ -1076,6 +1076,7 @@ void populateStandardToSPIRVPatterns(MLIRContext *context, UnaryAndBinaryOpPattern, UnaryAndBinaryOpPattern, UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, UnaryAndBinaryOpPattern, 
UnaryAndBinaryOpPattern, UnaryAndBinaryOpPattern, diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index df38df1749cbc0..6b071a053ce385 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -172,6 +172,21 @@ gpu.module @test_module { // ----- +gpu.module @test_module { + // CHECK: llvm.func @__nv_floorf(!llvm.float) -> !llvm.float + // CHECK: llvm.func @__nv_floor(!llvm.double) -> !llvm.double + // CHECK-LABEL: func @gpu_floor + func @gpu_floor(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = std.floorf %arg_f32 : f32 + // CHECK: llvm.call @__nv_floorf(%{{.*}}) : (!llvm.float) -> !llvm.float + %result64 = std.floorf %arg_f64 : f64 + // CHECK: llvm.call @__nv_floor(%{{.*}}) : (!llvm.double) -> !llvm.double + std.return %result32, %result64 : f32, f64 + } +} + +// ----- + gpu.module @test_module { // CHECK: llvm.func @__nv_cosf(!llvm.float) -> !llvm.float // CHECK: llvm.func @__nv_cos(!llvm.double) -> !llvm.double diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index a7565bb6e323f0..b17d75fd7afb0b 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -114,6 +114,21 @@ gpu.module @test_module { // ----- +gpu.module @test_module { + // CHECK: llvm.func @__ocml_floor_f32(!llvm.float) -> !llvm.float + // CHECK: llvm.func @__ocml_floor_f64(!llvm.double) -> !llvm.double + // CHECK-LABEL: func @gpu_floor + func @gpu_floor(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { + %result32 = std.floorf %arg_f32 : f32 + // CHECK: llvm.call @__ocml_floor_f32(%{{.*}}) : (!llvm.float) -> !llvm.float + %result64 = std.floorf %arg_f64 : f64 + // CHECK: llvm.call @__ocml_floor_f64(%{{.*}}) : (!llvm.double) -> !llvm.double + std.return %result32, %result64 : f32, f64 + } +} + +// ----- + gpu.module @test_module { // CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double diff --git a/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir b/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir index c55950a556344f..c7363085817e1e 100644 --- a/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir +++ b/mlir/test/Conversion/StandardToLLVM/standard-to-llvm.mlir @@ -40,6 +40,27 @@ func @sine(%arg0 : f32) { // ----- +// CHECK-LABEL: func @ceilf( +// CHECK-SAME: !llvm.float +func @ceilf(%arg0 : f32) { + // CHECK: "llvm.intr.ceil"(%arg0) : (!llvm.float) -> !llvm.float + %0 = ceilf %arg0 : f32 + std.return +} + +// ----- + +// CHECK-LABEL: func @floorf( +// CHECK-SAME: !llvm.float +func @floorf(%arg0 : f32) { + // CHECK: "llvm.intr.floor"(%arg0) : (!llvm.float) -> !llvm.float + %0 = floorf %arg0 : f32 + std.return +} + +// ----- + + // CHECK-LABEL: func @rsqrt_double( // CHECK-SAME: !llvm.double func @rsqrt_double(%arg0 : f64) { diff --git a/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir index e85f78f757a3a3..1b83af1be7551e 100644 --- a/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/std-ops-to-spirv.mlir @@ -65,6 +65,8 @@ func @float32_unary_scalar(%arg0: f32) { %8 = tanh %arg0 : f32 // CHECK: spv.GLSL.Sin %{{.*}}: f32 %9 = sin %arg0 : f32 + // CHECK: spv.GLSL.Floor %{{.*}}: f32 + %10 = floorf %arg0 : f32 return } diff --git 
a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir index 74470719047791..69e974bc41734d 100644 --- a/mlir/test/IR/core-ops.mlir +++ b/mlir/test/IR/core-ops.mlir @@ -554,6 +554,18 @@ func @standard_instrs(tensor<4x4x?xf32>, f32, i32, index, i64, f16) { // CHECK: = fptosi {{.*}} : f16 to i64 %162 = fptosi %half : f16 to i64 + // CHECK: floorf %arg1 : f32 + %163 = "std.floorf"(%f) : (f32) -> f32 + + // CHECK: %{{[0-9]+}} = floorf %arg1 : f32 + %164 = floorf %f : f32 + + // CHECK: %{{[0-9]+}} = floorf %cst_8 : vector<4xf32> + %165 = floorf %vcf32 : vector<4xf32> + + // CHECK: %{{[0-9]+}} = floorf %arg0 : tensor<4x4x?xf32> + %166 = floorf %t : tensor<4x4x?xf32> + return } From 40e269ea6db9c755c27e2ee1e201a640ac085afd Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Fri, 14 Aug 2020 01:58:00 -0700 Subject: [PATCH 067/101] [GlobalISel] Add a combine for ashr(shl x, c), c --> sext_inreg x, c' By detecting this sign extend pattern early, we can uncover opportunities for more optimizations. Differential Revision: https://reviews.llvm.org/D85965 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 6 ++ .../llvm/CodeGen/GlobalISel/MIPatternMatch.h | 6 ++ .../include/llvm/Target/GlobalISel/Combine.td | 12 ++- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 30 +++++++ ...galizercombiner-ashr-shl-to-sext-inreg.mir | 90 +++++++++++++++++++ .../AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll | 24 ++--- .../AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll | 35 ++++---- .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll | 9 +- .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll | 9 +- 9 files changed, 174 insertions(+), 47 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index e632f5fd05ec2b..e5f2700f6de9a2 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -294,6 +294,12 @@ class CombinerHelper { bool applyBuildInstructionSteps(MachineInstr &MI, InstructionStepsMatchInfo &MatchInfo); + /// Match ashr (shl x, C), C -> sext_inreg (C) + bool matchAshrShlToSextInreg(MachineInstr &MI, + std::tuple &MatchInfo); + bool applyAshShlToSextInreg(MachineInstr &MI, + std::tuple &MatchInfo); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. 
bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h index 043be086ff417d..4e216a284088bd 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MIPatternMatch.h @@ -251,6 +251,12 @@ m_GLShr(const LHS &L, const RHS &R) { return BinaryOp_match(L, R); } +template +inline BinaryOp_match +m_GAShr(const LHS &L, const RHS &R) { + return BinaryOp_match(L, R); +} + // Helper for unary instructions (G_[ZSA]EXT/G_TRUNC) etc template struct UnaryOp_match { SrcTy L; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 9cb45e2bfc117e..4647afad418505 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -284,6 +284,15 @@ def hoist_logic_op_with_same_opcode_hands: GICombineRule < (apply [{ return Helper.applyBuildInstructionSteps(*${root}, ${info});}]) >; +// Fold ashr (shl x, C), C -> sext_inreg (C) +def shl_ashr_to_sext_inreg_matchinfo : GIDefMatchData<"std::tuple">; +def shl_ashr_to_sext_inreg : GICombineRule< + (defs root:$root, shl_ashr_to_sext_inreg_matchinfo:$info), + (match (wip_match_opcode G_ASHR): $root, + [{ return Helper.matchAshrShlToSextInreg(*${root}, ${info}); }]), + (apply [{ return Helper.applyAshShlToSextInreg(*${root}, ${info});}]) +>; + // FIXME: These should use the custom predicate feature once it lands. def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, undef_to_negative_one, @@ -301,4 +310,5 @@ def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combines_for_extload, combine_indexed_load_store, undef_combines, identity_combines, simplify_add_to_sub, - hoist_logic_op_with_same_opcode_hands]>; + hoist_logic_op_with_same_opcode_hands, + shl_ashr_to_sext_inreg]>; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index b922f6988a2c23..48294a07597f8a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1887,6 +1887,36 @@ bool CombinerHelper::applyBuildInstructionSteps( return true; } +bool CombinerHelper::matchAshrShlToSextInreg( + MachineInstr &MI, std::tuple &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_ASHR); + int64_t ShlCst, AshrCst; + Register Src; + // FIXME: detect splat constant vectors. 
+ if (!mi_match(MI.getOperand(0).getReg(), MRI, + m_GAShr(m_GShl(m_Reg(Src), m_ICst(ShlCst)), m_ICst(AshrCst)))) + return false; + if (ShlCst != AshrCst) + return false; + if (!isLegalOrBeforeLegalizer( + {TargetOpcode::G_SEXT_INREG, {MRI.getType(Src)}})) + return false; + MatchInfo = {Src, ShlCst}; + return true; +} +bool CombinerHelper::applyAshShlToSextInreg( + MachineInstr &MI, std::tuple &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_ASHR); + Register Src; + int64_t ShiftAmt; + std::tie(Src, ShiftAmt) = MatchInfo; + unsigned Size = MRI.getType(Src).getScalarSizeInBits(); + Builder.setInstrAndDebugLoc(MI); + Builder.buildSExtInReg(MI.getOperand(0).getReg(), Src, Size - ShiftAmt); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir new file mode 100644 index 00000000000000..14bda863d2c289 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-ashr-shl-to-sext-inreg.mir @@ -0,0 +1,90 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: ashr_shl_to_sext_inreg +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: ashr_shl_to_sext_inreg + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s16) = G_SEXT_INREG [[TRUNC]], 8 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXT_INREG]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %1:_(s32) = COPY $w0 + %0:_(s16) = G_TRUNC %1(s32) + %2:_(s16) = G_CONSTANT i16 8 + %3:_(s16) = G_SHL %0, %2(s16) + %4:_(s16) = exact G_ASHR %3, %2(s16) + %5:_(s32) = G_ANYEXT %4(s16) + $w0 = COPY %5(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: different_shift_amts +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: different_shift_amts + ; CHECK: liveins: $w0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 12 + ; CHECK: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16) + ; CHECK: [[ASHR:%[0-9]+]]:_(s16) = exact G_ASHR [[SHL]], [[C1]](s16) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[ASHR]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %1:_(s32) = COPY $w0 + %0:_(s16) = G_TRUNC %1(s32) + %2:_(s16) = G_CONSTANT i16 12 + %4:_(s16) = G_CONSTANT i16 8 + %3:_(s16) = G_SHL %0, %2(s16) + %5:_(s16) = exact G_ASHR %3, %4(s16) + %6:_(s32) = G_ANYEXT %5(s16) + $w0 = COPY %6(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: ashr_shl_to_sext_inreg_vector +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$d0' } +body: | + bb.1: + liveins: $d0 + ; Currently don't support this for vectors just yet, this will need updating + ; when we do. 
+ ; CHECK-LABEL: name: ashr_shl_to_sext_inreg_vector + ; CHECK: liveins: $d0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 + ; CHECK: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) + ; CHECK: [[SHL:%[0-9]+]]:_(<4 x s16>) = G_SHL [[COPY]], [[BUILD_VECTOR]](<4 x s16>) + ; CHECK: [[ASHR:%[0-9]+]]:_(<4 x s16>) = exact G_ASHR [[SHL]], [[BUILD_VECTOR]](<4 x s16>) + ; CHECK: $d0 = COPY [[ASHR]](<4 x s16>) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<4 x s16>) = COPY $d0 + %2:_(s16) = G_CONSTANT i16 8 + %1:_(<4 x s16>) = G_BUILD_VECTOR %2(s16), %2(s16), %2(s16), %2(s16) + %3:_(<4 x s16>) = G_SHL %0, %1(<4 x s16>) + %4:_(<4 x s16>) = exact G_ASHR %3, %1(<4 x s16>) + $d0 = COPY %4(<4 x s16>) + RET_ReallyLR implicit $d0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll index e5d26476e94248..f3a53fb7d22d50 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll @@ -674,8 +674,7 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrs ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 -; GFX6-NEXT: s_lshl_b32 s0, s0, 8 -; GFX6-NEXT: s_ashr_i32 s0, s0, 8 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x180000 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -830,8 +829,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s2, s2, s0 ; GFX6-NEXT: s_bfe_i32 s0, s2, 0x80000 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -854,8 +852,7 @@ define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %ou ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_add_i32 s2, s2, s0 ; GFX6-NEXT: s_bfe_i32 s0, s2, 8 -; GFX6-NEXT: s_lshl_b32 s0, s0, 24 -; GFX6-NEXT: s_ashr_i32 s0, s0, 24 +; GFX6-NEXT: s_sext_i32_i8 s0, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm @@ -879,8 +876,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 add ; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 @@ -904,8 +900,7 @@ define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 a ; GFX6-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_bfe_i32 v0, v0, 8, 0 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 +; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm %load = load i8, i8 addrspace(1)* %ptr, align 1 @@ -927,8 +922,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: 
s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -951,8 +945,7 @@ define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 30 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10001 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -975,8 +968,7 @@ define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 30 -; GFX6-NEXT: s_ashr_i32 s0, s0, 30 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20000 ; GFX6-NEXT: s_bfe_i32 s0, s0, 0x20001 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll index ab3fbc03e81d57..a8098b7dd9d159 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ubfe.ll @@ -423,8 +423,7 @@ define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace( ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_lshl_b32 s0, s0, 31 -; GFX6-NEXT: s_ashr_i32 s0, s0, 31 +; GFX6-NEXT: s_bfe_i32 s0, s0, 0x10000 ; GFX6-NEXT: s_bfe_u32 s0, s0, 0x10000 ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 @@ -950,22 +949,22 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GFX6-LABEL: simplify_bfe_u32_multi_use_arg: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb -; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX6-NEXT: s_mov_b32 s6, -1 -; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 -; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s0, s0, 63 -; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 -; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 -; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 -; GFX6-NEXT: s_endpgm +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_and_b32 s0, s0, 63 +; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GFX6-NEXT: s_endpgm i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index f6565fe1b6e24a..db9e75dd582c82 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -3415,8 +3415,7 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3736,10 +3735,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v3|
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 06d46321a59b61..7f55c735859753 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3363,8 +3363,7 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) {
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and i64 %num, 16777215
@@ -3677,20 +3676,18 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_rcp_f32_e32 v5, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
 ; CGP-NEXT:    v_or_b32_e32 v6, 1, v6
-; CGP-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; CGP-NEXT:    v_bfe_i32 v0, v0, 0, 25
 ; CGP-NEXT:    v_mul_f32_e32 v5, v1, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v5, v5
 ; CGP-NEXT:    v_mad_f32 v1, -v5, v4, v1
 ; CGP-NEXT:    v_cvt_i32_f32_e32 v5, v5
 ; CGP-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v4|
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, 0, v6, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v0, 7, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; CGP-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v2, 7, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v2, 7, v2
+; CGP-NEXT:    v_bfe_i32 v2, v2, 0, 25
 ; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %num.mask = and <2 x i64> %num, <i64 16777215, i64 16777215>
From 04a6ea5d77e7613a5e1398ddf2a0fcb4e1cea41c Mon Sep 17 00:00:00 2001
From: Amara Emerson
Date: Fri, 14 Aug 2020 02:00:07 -0700
Subject: [PATCH 068/101] [GlobalISel] Add a combine for sext_inreg(load x), c --> sextload x

This is restricted to single-use loads; if we fold them to sextloads, we can
find more optimal addressing modes on AArch64.

This also fixes an overload of the MachineFunction::getMachineMemOperand()
method which was incorrectly using the MF alignment instead of the MMO
alignment.
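A sketch of the transform on generic MIR, reusing the example from the
comment added to CombinerHelper.cpp below:

  %ld = G_LOAD %ptr, (load 2)
  %ext = G_SEXT_INREG %ld, 8
  ==>
  %ld = G_SEXTLOAD %ptr (load 1)

The load is narrowed when the extension width is smaller than the memory
width, and non-simple (atomic or volatile) loads are left alone.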
Differential Revision: https://reviews.llvm.org/D85966
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   4 +
 .../include/llvm/Target/GlobalISel/Combine.td |   9 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  62 +++++++++++
 llvm/lib/CodeGen/MachineFunction.cpp          |   2 +-
 ...alizercombiner-sextload-from-sextinreg.mir | 103 ++++++++++++++++++
 5 files changed, 178 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-sextload-from-sextinreg.mir

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index e5f2700f6de9a2..61af8cd15f11de 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -135,6 +135,10 @@ class CombinerHelper {
   bool matchSextTruncSextLoad(MachineInstr &MI);
   bool applySextTruncSextLoad(MachineInstr &MI);

+  /// Match sext_inreg(load p), imm -> sextload p
+  bool matchSextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
+  bool applySextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
+
   bool matchElideBrByInvertingCond(MachineInstr &MI);
   void applyElideBrByInvertingCond(MachineInstr &MI);
   bool tryElideBrByInvertingCond(MachineInstr &MI);
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 4647afad418505..2e85c6064a127c 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -132,6 +132,13 @@ def sext_trunc_sextload : GICombineRule<
          [{ return Helper.matchSextTruncSextLoad(*${d}); }]),
   (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>;

+def sext_inreg_of_load_matchdata : GIDefMatchData<"std::tuple<Register, unsigned>">;
+def sext_inreg_of_load : GICombineRule<
+  (defs root:$root, sext_inreg_of_load_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SEXT_INREG):$root,
+         [{ return Helper.matchSextInRegOfLoad(*${root}, ${matchinfo}); }]),
+  (apply [{ return Helper.applySextInRegOfLoad(*${root}, ${matchinfo}); }])>;
+
 def combine_indexed_load_store : GICombineRule<
   (defs root:$root, indexed_load_store_matchdata:$matchinfo),
   (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root,
@@ -311,4 +318,4 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     combines_for_extload, combine_indexed_load_store, undef_combines,
     identity_combines, simplify_add_to_sub,
     hoist_logic_op_with_same_opcode_hands,
-    shl_ashr_to_sext_inreg]>;
+    shl_ashr_to_sext_inreg, sext_inreg_of_load]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 48294a07597f8a..588a24e1dc57f3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -614,6 +615,67 @@ bool CombinerHelper::applySextTruncSextLoad(MachineInstr &MI) {
   return true;
 }

+bool CombinerHelper::matchSextInRegOfLoad(
+    MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo) {
+  assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+
+  // Only supports scalars for now.
+ if (MRI.getType(MI.getOperand(0).getReg()).isVector()) + return false; + + Register SrcReg = MI.getOperand(1).getReg(); + MachineInstr *LoadDef = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI); + if (!LoadDef || !MRI.hasOneNonDBGUse(LoadDef->getOperand(0).getReg())) + return false; + + // If the sign extend extends from a narrower width than the load's width, + // then we can narrow the load width when we combine to a G_SEXTLOAD. + auto &MMO = **LoadDef->memoperands_begin(); + // Don't do this for non-simple loads. + if (MMO.isAtomic() || MMO.isVolatile()) + return false; + + // Avoid widening the load at all. + unsigned NewSizeBits = + std::min((uint64_t)MI.getOperand(2).getImm(), MMO.getSizeInBits()); + + // Don't generate G_SEXTLOADs with a < 1 byte width. + if (NewSizeBits < 8) + return false; + // Don't bother creating a non-power-2 sextload, it will likely be broken up + // anyway for most targets. + if (!isPowerOf2_32(NewSizeBits)) + return false; + MatchInfo = {LoadDef->getOperand(0).getReg(), NewSizeBits}; + return true; +} + +bool CombinerHelper::applySextInRegOfLoad( + MachineInstr &MI, std::tuple &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + Register LoadReg; + unsigned ScalarSizeBits; + std::tie(LoadReg, ScalarSizeBits) = MatchInfo; + auto *LoadDef = MRI.getVRegDef(LoadReg); + assert(LoadDef && "Expected a load reg"); + + // If we have the following: + // %ld = G_LOAD %ptr, (load 2) + // %ext = G_SEXT_INREG %ld, 8 + // ==> + // %ld = G_SEXTLOAD %ptr (load 1) + + auto &MMO = **LoadDef->memoperands_begin(); + Builder.setInstrAndDebugLoc(MI); + auto &MF = Builder.getMF(); + auto PtrInfo = MMO.getPointerInfo(); + auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, ScalarSizeBits / 8); + Builder.buildLoadInstr(TargetOpcode::G_SEXTLOAD, MI.getOperand(0).getReg(), + LoadDef->getOperand(1).getReg(), *NewMMO); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, Register &Offset) { auto &MF = *MI.getParent()->getParent(); diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 464f71a4fd5397..abf47847814f20 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -477,7 +477,7 @@ MachineMemOperand *MachineFunction::getMachineMemOperand( MachineMemOperand *MachineFunction::getMachineMemOperand( const MachineMemOperand *MMO, MachinePointerInfo &PtrInfo, uint64_t Size) { return new (Allocator) MachineMemOperand( - PtrInfo, MMO->getFlags(), Size, Alignment, AAMDNodes(), nullptr, + PtrInfo, MMO->getFlags(), Size, MMO->getBaseAlign(), AAMDNodes(), nullptr, MMO->getSyncScopeID(), MMO->getOrdering(), MMO->getFailureOrdering()); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-sextload-from-sextinreg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-sextload-from-sextinreg.mir new file mode 100644 index 00000000000000..a216c5b74b3561 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-sextload-from-sextinreg.mir @@ -0,0 +1,103 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: sextload_from_inreg +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: sextload_from_inreg + ; CHECK: liveins: $x0 + ; CHECK: 
[[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s16) = G_SEXTLOAD [[COPY]](p0) :: (load 1, align 2) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXTLOAD]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(p0) = COPY $x0 + %1:_(s16) = G_LOAD %0(p0) :: (load 2) + %2:_(s16) = G_SEXT_INREG %1, 8 + %3:_(s32) = G_ANYEXT %2(s16) + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: non_pow_2_inreg +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: non_pow_2_inreg + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 4) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[LOAD]], 24 + ; CHECK: $w0 = COPY [[SEXT_INREG]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(p0) = COPY $x0 + %1:_(s32) = G_LOAD %0(p0) :: (load 4) + %2:_(s32) = G_SEXT_INREG %1, 24 + $w0 = COPY %2(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: atomic +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: atomic + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (load acquire 2) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s16) = G_SEXT_INREG [[LOAD]], 8 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXT_INREG]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(p0) = COPY $x0 + %1:_(s16) = G_LOAD %0(p0) :: (load acquire 2) + %2:_(s16) = G_SEXT_INREG %1, 8 + %3:_(s32) = G_ANYEXT %2(s16) + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: volatile +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: volatile + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[COPY]](p0) :: (volatile load 2) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s16) = G_SEXT_INREG [[LOAD]], 8 + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SEXT_INREG]](s16) + ; CHECK: $w0 = COPY [[ANYEXT]](s32) + ; CHECK: RET_ReallyLR implicit $w0 + %0:_(p0) = COPY $x0 + %1:_(s16) = G_LOAD %0(p0) :: (volatile load 2) + %2:_(s16) = G_SEXT_INREG %1, 8 + %3:_(s32) = G_ANYEXT %2(s16) + $w0 = COPY %3(s32) + RET_ReallyLR implicit $w0 + +... From a3538b83943f640865b92e947da0d5ef5bdc930b Mon Sep 17 00:00:00 2001 From: Tim Keith Date: Tue, 18 Aug 2020 10:47:52 -0700 Subject: [PATCH 069/101] [flang] Improve error messages for procedures in expressions When a procedure name was used on the RHS of an assignment we were not reporting the error. When one was used in an expression the error message wasn't very good (e.g. "Operands of + must be numeric; have INTEGER(4) and untyped"). Detect these cases in ArgumentAnalyzer and emit better messages, depending on whether the named procedure is a function or subroutine. Procedure names may appear as actual arguments to function and subroutine calls so don't report errors in those cases. That is the same case where assumed type arguments are allowed, so rename `isAssumedType_` to `isProcedureCall_` and use that to decide if it is an error. 
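For example (these lines mirror the new tests added to
flang/test/Semantics/assign04.f90 below):

  intrinsic :: sin
  real :: a
  !ERROR: Function call must have argument list
  a = sin
  !ERROR: Subroutine name is not allowed here
  a = s11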
Differential Revision: https://reviews.llvm.org/D86107 --- flang/include/flang/Evaluate/tools.h | 1 + flang/lib/Evaluate/tools.cpp | 4 ++++ flang/lib/Semantics/expression.cpp | 25 +++++++++++++++++-------- flang/test/Semantics/assign04.f90 | 9 +++++++++ flang/test/Semantics/resolve63.f90 | 5 +++++ 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 081795208b136a..98d4a516054eea 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -813,6 +813,7 @@ template bool IsAllocatableOrPointer(const A &x) { // Procedure and pointer detection predicates bool IsProcedure(const Expr &); +bool IsFunction(const Expr &); bool IsProcedurePointer(const Expr &); bool IsNullPointer(const Expr &); diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 6cc411f22adb05..e9089f56aa46a5 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -703,6 +703,10 @@ bool IsAssumedRank(const ActualArgument &arg) { bool IsProcedure(const Expr &expr) { return std::holds_alternative(expr.u); } +bool IsFunction(const Expr &expr) { + const auto *designator{std::get_if(&expr.u)}; + return designator && designator->GetType().has_value(); +} bool IsProcedurePointer(const Expr &expr) { return std::visit(common::visitors{ diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 9b6531cdbd6dfb..cfb908179c3a93 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -98,11 +98,10 @@ static std::optional AnalyzeTypeSpec( class ArgumentAnalyzer { public: explicit ArgumentAnalyzer(ExpressionAnalyzer &context) - : context_{context}, allowAssumedType_{false} {} + : context_{context}, isProcedureCall_{false} {} ArgumentAnalyzer(ExpressionAnalyzer &context, parser::CharBlock source, - bool allowAssumedType = false) - : context_{context}, source_{source}, allowAssumedType_{ - allowAssumedType} {} + bool isProcedureCall = false) + : context_{context}, source_{source}, isProcedureCall_{isProcedureCall} {} bool fatalErrors() const { return fatalErrors_; } ActualArguments &&GetActuals() { CHECK(!fatalErrors_); @@ -167,7 +166,7 @@ class ArgumentAnalyzer { ActualArguments actuals_; parser::CharBlock source_; bool fatalErrors_{false}; - const bool allowAssumedType_; + const bool isProcedureCall_; // false for user-defined op or assignment const Symbol *sawDefinedOp_{nullptr}; }; @@ -2003,7 +2002,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::FunctionReference &funcRef, std::optional *structureConstructor) { const parser::Call &call{funcRef.v}; auto restorer{GetContextualMessages().SetLocation(call.source)}; - ArgumentAnalyzer analyzer{*this, call.source, true /* allowAssumedType */}; + ArgumentAnalyzer analyzer{*this, call.source, true /* isProcedureCall */}; for (const auto &arg : std::get>(call.t)) { analyzer.Analyze(arg, false /* not subroutine call */); } @@ -2042,7 +2041,7 @@ MaybeExpr ExpressionAnalyzer::Analyze(const parser::FunctionReference &funcRef, void ExpressionAnalyzer::Analyze(const parser::CallStmt &callStmt) { const parser::Call &call{callStmt.v}; auto restorer{GetContextualMessages().SetLocation(call.source)}; - ArgumentAnalyzer analyzer{*this, call.source, true /* allowAssumedType */}; + ArgumentAnalyzer analyzer{*this, call.source, true /* isProcedureCall */}; const auto &actualArgList{std::get>(call.t)}; for (const auto &arg : actualArgList) { analyzer.Analyze(arg, true 
/* is subroutine call */); @@ -2982,7 +2981,7 @@ std::optional ArgumentAnalyzer::AnalyzeExpr( source_.ExtendToCover(expr.source); if (const Symbol * assumedTypeDummy{AssumedTypeDummy(expr)}) { expr.typedExpr.Reset(new GenericExprWrapper{}, GenericExprWrapper::Deleter); - if (allowAssumedType_) { + if (isProcedureCall_) { return ActualArgument{ActualArgument::AssumedType{*assumedTypeDummy}}; } else { context_.SayAt(expr.source, @@ -2990,6 +2989,16 @@ std::optional ArgumentAnalyzer::AnalyzeExpr( return std::nullopt; } } else if (MaybeExpr argExpr{context_.Analyze(expr)}) { + if (!isProcedureCall_ && IsProcedure(*argExpr)) { + if (IsFunction(*argExpr)) { + context_.SayAt( + expr.source, "Function call must have argument list"_err_en_US); + } else { + context_.SayAt( + expr.source, "Subroutine name is not allowed here"_err_en_US); + } + return std::nullopt; + } return ActualArgument{context_.Fold(std::move(*argExpr))}; } else { return std::nullopt; diff --git a/flang/test/Semantics/assign04.f90 b/flang/test/Semantics/assign04.f90 index 99f4901b205016..fb47f6dceab966 100644 --- a/flang/test/Semantics/assign04.f90 +++ b/flang/test/Semantics/assign04.f90 @@ -132,3 +132,12 @@ subroutine s10(a, n) real a(n) a(1:n) = 0.0 ! should not get a second error here end + +subroutine s11 + intrinsic :: sin + real :: a + !ERROR: Function call must have argument list + a = sin + !ERROR: Subroutine name is not allowed here + a = s11 +end diff --git a/flang/test/Semantics/resolve63.f90 b/flang/test/Semantics/resolve63.f90 index bd4e1d14e195a2..141945a262276d 100644 --- a/flang/test/Semantics/resolve63.f90 +++ b/flang/test/Semantics/resolve63.f90 @@ -104,6 +104,7 @@ subroutine test_conformability(x, y) ! Invalid operand types when user-defined operator is not available module m2 + intrinsic :: sin type :: t end type type(t) :: x, y @@ -113,6 +114,10 @@ module m2 subroutine test_relational() !ERROR: Operands of .EQ. must have comparable types; have TYPE(t) and REAL(4) l = x == r + !ERROR: Subroutine name is not allowed here + l = r == test_numeric + !ERROR: Function call must have argument list + l = r == sin end subroutine test_numeric() !ERROR: Operands of + must be numeric; have REAL(4) and TYPE(t) From f7a49d2aa691266497c4baa35f29ba0167b39d23 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Fri, 14 Aug 2020 07:56:29 -0700 Subject: [PATCH 070/101] [WIP][DebugInfo] Lazily parse debug_loclist offsets Parsing DWARFv5 debug_loclist offsets when a CU is parsed is weighing down memory usage of symbolizers that don't need to parse this data at all. There's not much benefit to caching these anyway - since they are O(1) lookup and reading once you know where the offset list starts (and can do bounds checking with the offset list size too). In general, I think it might be time to start paying down some of the technical debt of loc/loclist/range/rnglist parsing to try to unify it a bit more. eg: * Currently DWARFUnit has: RangeSection, RangeSectionBase, LocSection, LocSectionBase, LocTable, RngListTable, LoclistTableHeader (be nice if these were all wrapped up in two variables - one for loclists, one for rnglists) * rnglists and loclists are handled differently (see: LoclistTableHeader, but no RnglistTableHeader) * maybe all these types could be less stateful - lazily parse what they need to, even reparsing rather than caching because it doesn't seem too expensive, for instance. 
From f7a49d2aa691266497c4baa35f29ba0167b39d23 Mon Sep 17 00:00:00 2001
From: David Blaikie
Date: Fri, 14 Aug 2020 07:56:29 -0700
Subject: [PATCH 070/101] [WIP][DebugInfo] Lazily parse debug_loclist offsets

Parsing DWARFv5 debug_loclist offsets when a CU is parsed is weighing
down memory usage of symbolizers that don't need to parse this data at
all. There's not much benefit to caching these anyway, since lookup and
reading are O(1) once you know where the offset list starts (and the
offset list size allows bounds checking too).

In general, I think it might be time to start paying down some of the
technical debt of loc/loclist/range/rnglist parsing to try to unify it
a bit more. eg:

* Currently DWARFUnit has: RangeSection, RangeSectionBase, LocSection,
  LocSectionBase, LocTable, RngListTable, LoclistTableHeader (it'd be
  nice if these were all wrapped up in two variables - one for loclists,
  one for rnglists)

* rnglists and loclists are handled differently (see: LoclistTableHeader,
  but no RnglistTableHeader)

* maybe all these types could be less stateful - lazily parse what they
  need to, even reparsing rather than caching because it doesn't seem too
  expensive, for instance. (though admittedly so long as it's constant
  cost/overhead per compilation that's probably adequate)

* Maybe implementing and using a DWARFDataExtractor that can be
  sub-ranged (so we could slice it up to just the single contribution) -
  though maybe that's not so useful because loc/ranges need to refer to
  it by absolute, not contribution-relative mechanisms

Differential Revision: https://reviews.llvm.org/D86110
---
 .../Plugins/SymbolFile/DWARF/DWARFUnit.h      |  7 +++--
 .../SymbolFile/DWARF/DW_AT_loclists_base.s    |  4 +--
 .../llvm/DebugInfo/DWARF/DWARFDebugLoc.h      |  2 ++
 .../llvm/DebugInfo/DWARF/DWARFListTable.h     | 31 ++++++++++---------
 llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h | 11 ++-----
 llvm/lib/DebugInfo/DWARF/DWARFContext.cpp     |  4 +--
 llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp   |  9 +++---
 llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp        | 16 +++++++---
 8 files changed, 46 insertions(+), 38 deletions(-)
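A minimal sketch of the lookup this switches to (illustrative only; the
names HeaderOffset, HeaderSize, Format, Data, and Index stand in for the
list-table header state visible in the diffs below, they are not a new
API):

    // DWARFv5 offset-array entries are raw 4-byte (DWARF32) or 8-byte
    // (DWARF64) values laid out back-to-back right after the fixed table
    // header, so fetching entry `Index` is one bounds check against
    // OffsetEntryCount plus one read; no per-CU vector is built or kept.
    uint64_t EntrySize = Format == dwarf::DWARF64 ? 8 : 4;
    uint64_t Pos = HeaderOffset + HeaderSize + EntrySize * Index;
    uint64_t ListOffset = Data.getUnsigned(&Pos, EntrySize);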
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index affad286a49038..1d8236c4ed42fd 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -237,7 +237,9 @@ class DWARFUnit : public lldb_private::UserID {
   llvm::Optional<uint64_t> GetRnglistOffset(uint32_t Index) const {
     if (!m_rnglist_table)
       return llvm::None;
-    if (llvm::Optional<uint64_t> off = m_rnglist_table->getOffsetEntry(Index))
+    if (llvm::Optional<uint64_t> off = m_rnglist_table->getOffsetEntry(
+            m_dwarf.GetDWARFContext().getOrLoadRngListsData().GetAsLLVM(),
+            Index))
       return *off + m_ranges_base;
     return llvm::None;
   }
@@ -246,7 +248,8 @@ class DWARFUnit : public lldb_private::UserID {
     if (!m_loclist_table_header)
       return llvm::None;
 
-    llvm::Optional<uint64_t> Offset = m_loclist_table_header->getOffsetEntry(Index);
+    llvm::Optional<uint64_t> Offset = m_loclist_table_header->getOffsetEntry(
+        m_dwarf.GetDWARFContext().getOrLoadLocListsData().GetAsLLVM(), Index);
     if (!Offset)
       return llvm::None;
     return *Offset + m_loclists_base;
diff --git a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_loclists_base.s b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_loclists_base.s
index ca32e9930a76a7..6ff35f8c659648 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/DW_AT_loclists_base.s
+++ b/lldb/test/Shell/SymbolFile/DWARF/DW_AT_loclists_base.s
@@ -5,7 +5,7 @@
 # CHECK-LABEL: image lookup -v -s lookup_loclists
 # CHECK: Variable: {{.*}}, name = "x0", type = "int", location = DW_OP_reg0 RAX,
-# CHECK: Variable: {{.*}}, name = "x1", type = "int", location = <empty>,
+# CHECK-NOT: Variable:
 
 loclists:
         nop
@@ -28,7 +28,7 @@ lookup_loclists:
         .short  5       # Version
         .byte   8       # Address size
         .byte   0       # Segment selector size
-        .long   1       # Offset entry count
+        .long   2       # Offset entry count
 .Lloclists_table_base:
         .long   .Ldebug_loc0-.Lloclists_table_base
         .long   .Ldebug_loc1-.Lloclists_table_base
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index 3b141304f85f49..dbc11c51a7890b 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -72,6 +72,8 @@ class DWARFLocationTable {
       std::function<Optional<object::SectionedAddress>(uint32_t)> LookupAddr,
       function_ref<bool(Expected<DWARFLocationExpression>)> Callback) const;
 
+  const DWARFDataExtractor &getData() { return Data; }
+
 protected:
   DWARFDataExtractor Data;
 
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h
index 496fdb2477f9dc..bcfc71381aeee8 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFListTable.h
@@ -72,10 +72,6 @@ class DWARFListTableHeader {
   };
   Header HeaderData;
-  /// The offset table, which contains offsets to the individual list entries.
-  /// It is used by forms such as DW_FORM_rnglistx.
-  /// FIXME: Generate the table and use the appropriate forms.
-  std::vector<uint64_t> Offsets;
   /// The table's format, either DWARF32 or DWARF64.
   dwarf::DwarfFormat Format;
   /// The offset at which the header (and hence the table) is located within
@@ -93,7 +89,6 @@ class DWARFListTableHeader {
 
   void clear() {
     HeaderData = {};
-    Offsets.clear();
   }
   uint64_t getHeaderOffset() const { return HeaderOffset; }
   uint8_t getAddrSize() const { return HeaderData.AddrSize; }
@@ -115,11 +110,17 @@
     llvm_unreachable("Invalid DWARF format (expected DWARF32 or DWARF64");
   }
 
-  void dump(raw_ostream &OS, DIDumpOptions DumpOpts = {}) const;
-  Optional<uint64_t> getOffsetEntry(uint32_t Index) const {
-    if (Index < Offsets.size())
-      return Offsets[Index];
-    return None;
+  void dump(DataExtractor Data, raw_ostream &OS,
+            DIDumpOptions DumpOpts = {}) const;
+  Optional<uint64_t> getOffsetEntry(DataExtractor Data, uint32_t Index) const {
+    if (Index > HeaderData.OffsetEntryCount)
+      return None;
+
+    uint8_t OffsetByteSize = Format == dwarf::DWARF64 ? 8 : 4;
+    uint64_t Offset =
+        getHeaderOffset() + getHeaderSize(Format) + OffsetByteSize * Index;
+    auto R = Data.getUnsigned(&Offset, OffsetByteSize);
+    return R;
   }
 
   /// Extract the table header and the array of offsets.
@@ -169,14 +170,14 @@ template <typename DWARFListType> class DWARFListTableBase {
   uint8_t getAddrSize() const { return Header.getAddrSize(); }
   dwarf::DwarfFormat getFormat() const { return Header.getFormat(); }
 
-  void dump(raw_ostream &OS,
+  void dump(DWARFDataExtractor Data, raw_ostream &OS,
             llvm::function_ref<Optional<object::SectionedAddress>(uint32_t)>
                 LookupPooledAddress,
             DIDumpOptions DumpOpts = {}) const;
 
   /// Return the contents of the offset entry designated by a given index.
-  Optional<uint64_t> getOffsetEntry(uint32_t Index) const {
-    return Header.getOffsetEntry(Index);
+  Optional<uint64_t> getOffsetEntry(DataExtractor Data, uint32_t Index) const {
+    return Header.getOffsetEntry(Data, Index);
   }
 
   /// Return the size of the table header including the length but not including
   /// the offsets. This is dependent on the table format, which is unambiguously
@@ -240,11 +241,11 @@ Error DWARFListType<ListEntryType>::extract(DWARFDataExtractor Data,
 
 template <typename DWARFListType>
 void DWARFListTableBase<DWARFListType>::dump(
-    raw_ostream &OS,
+    DWARFDataExtractor Data, raw_ostream &OS,
     llvm::function_ref<Optional<object::SectionedAddress>(uint32_t)>
         LookupPooledAddress,
     DIDumpOptions DumpOpts) const {
-  Header.dump(OS, DumpOpts);
+  Header.dump(Data, OS, DumpOpts);
   OS << HeaderString << "\n";
 
   // Determine the length of the longest encoding string we have in the table,
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index 1613e74d4a367e..c76ee5efa37b26 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -412,18 +412,13 @@ class DWARFUnit {
   /// Return a rangelist's offset based on an index. The index designates
   /// an entry in the rangelist table's offset array and is supplied by
   /// DW_FORM_rnglistx.
-  Optional<uint64_t> getRnglistOffset(uint32_t Index) {
-    if (!RngListTable)
-      return None;
-    if (Optional<uint64_t> Off = RngListTable->getOffsetEntry(Index))
-      return *Off + RangeSectionBase;
-    return None;
-  }
+  Optional<uint64_t> getRnglistOffset(uint32_t Index);
 
   Optional<uint64_t> getLoclistOffset(uint32_t Index) {
     if (!LoclistTableHeader)
       return None;
-    if (Optional<uint64_t> Off = LoclistTableHeader->getOffsetEntry(Index))
+    if (Optional<uint64_t> Off =
+            LoclistTableHeader->getOffsetEntry(LocTable->getData(), Index))
       return *Off + getLocSectionBase();
     return None;
   }
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 3bcde8fafb1f75..96ba5794683344 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -255,7 +255,7 @@ static void dumpRnglistsSection(
         break;
       Offset = TableOffset + Length;
     } else {
-      Rnglists.dump(OS, LookupPooledAddress, DumpOpts);
+      Rnglists.dump(rnglistData, OS, LookupPooledAddress, DumpOpts);
     }
   }
 }
@@ -316,7 +316,7 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
       return;
     }
 
-    Header.dump(OS, DumpOpts);
+    Header.dump(Data, OS, DumpOpts);
 
     uint64_t EndOffset = Header.length() + Header.getHeaderOffset();
     Data.setAddressSize(Header.getAddrSize());
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
index 2124a49bef6065..c876af1e9b5135 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -71,12 +71,12 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
                              ") than there is space for",
                              SectionName.data(), HeaderOffset,
                              HeaderData.OffsetEntryCount);
   Data.setAddressSize(HeaderData.AddrSize);
-  for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I)
-    Offsets.push_back(Data.getRelocatedValue(OffsetByteSize, OffsetPtr));
+  *OffsetPtr += HeaderData.OffsetEntryCount * OffsetByteSize;
   return Error::success();
 }
 
-void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
+void DWARFListTableHeader::dump(DataExtractor Data, raw_ostream &OS,
+                                DIDumpOptions DumpOpts) const {
   if (DumpOpts.Verbose)
     OS << format("0x%8.8" PRIx64 ": ", HeaderOffset);
   int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(Format);
@@ -91,7 +91,8 @@ void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
   if (HeaderData.OffsetEntryCount > 0) {
     OS << "offsets: [";
-    for (const auto &Off : Offsets) {
+    for (uint32_t I = 0; I < HeaderData.OffsetEntryCount; ++I) {
+      auto Off = *getOffsetEntry(Data, I);
       OS << format("\n0x%0*" PRIx64, OffsetDumpWidth, Off);
       if (DumpOpts.Verbose)
         OS << format(" => 0x%08" PRIx64,
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 0527f29d1a1a9d..b871e6ebdca56c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -548,17 +548,13 @@ Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
       uint64_t HeaderSize = DWARFListTableHeader::getHeaderSize(Header.getFormat());
       uint64_t Offset = getLocSectionBase();
-      DWARFDataExtractor Data(Context.getDWARFObj(), *LocSection,
-                              isLittleEndian, getAddressByteSize());
+      const DWARFDataExtractor &Data = LocTable->getData();
       if (Offset < HeaderSize)
         return createStringError(errc::invalid_argument,
                                  "did not detect a valid"
                                  " list table with base = 0x%" PRIx64 "\n",
                                  Offset);
       Offset -= HeaderSize;
-      if (auto *IndexEntry = Header.getIndexEntry())
-        if (const auto *Contrib = IndexEntry->getContribution(DW_SECT_LOCLISTS))
-          Offset += Contrib->Offset;
       if (Error E = LoclistTableHeader->extract(Data, &Offset))
         return createStringError(errc::invalid_argument,
                                  "parsing a loclist table: " +
                                      toString(std::move(E)));
@@ -1009,3 +1005,13 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor & DA) {
     return DescOrError.takeError();
   return *DescOrError;
 }
+
+Optional<uint64_t> DWARFUnit::getRnglistOffset(uint32_t Index) {
+  if (!RngListTable)
+    return None;
+  DataExtractor RangesData(RangeSection->Data, isLittleEndian,
+                           getAddressByteSize());
+  if (Optional<uint64_t> Off = RngListTable->getOffsetEntry(RangesData, Index))
+    return *Off + RangeSectionBase;
+  return None;
+}

From 5a15f6628efcb583e1cca1fdc57d7e64f5f665da Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Mon, 27 Jul 2020 22:00:50 -0400
Subject: [PATCH 071/101] GlobalISel: Implement fewerElementsVector for G_INSERT_VECTOR_ELT

Add unit tests since AMDGPU will only trigger this for gigantic
vectors, and won't use the annoying odd sized breakdown case.
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |    6 +-
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |   45 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |    2 +-
 .../AMDGPU/GlobalISel/insertelement.large.ll  |  137 ++
 .../GlobalISel/legalize-insert-vector-elt.mir | 1227 ++---------------
 .../GlobalISel/LegalizerHelperTest.cpp        |   81 ++
 6 files changed, 392 insertions(+), 1106 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index e1295f66c29847..d0e7419ec8129d 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -279,9 +279,9 @@ class LegalizerHelper {
   LegalizeResult fewerElementsVectorBuildVector(MachineInstr &MI,
                                                 unsigned TypeIdx,
                                                 LLT NarrowTy);
 
-  LegalizeResult fewerElementsVectorExtractVectorElt(MachineInstr &MI,
-                                                     unsigned TypeIdx,
-                                                     LLT NarrowTy);
+  LegalizeResult fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
+                                                           unsigned TypeIdx,
+                                                           LLT NarrowTy);
 
   LegalizeResult reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
                                       LLT NarrowTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 841c37ec7a3ea0..2f722d04a69732 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3608,18 +3608,24 @@ LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI,
 }
 
 LegalizerHelper::LegalizeResult
-LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI,
-                                                     unsigned TypeIdx,
-                                                     LLT NarrowVecTy) {
-  assert(TypeIdx == 1 && "not a vector type index");
+LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
+                                                           unsigned TypeIdx,
+                                                           LLT NarrowVecTy) {
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcVec = MI.getOperand(1).getReg();
+  Register InsertVal;
+  bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
+
+  assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
+  if (IsInsert)
+    InsertVal = MI.getOperand(2).getReg();
+
+  Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
 
   // TODO: Handle total scalarization case.
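  // Illustrative note (not part of the original patch): for a constant
  // index, the code below routes the operation to a single NarrowTy piece.
  // The piece is selected as IdxVal / NewNumElts and the lane within it is
  // IdxVal - NewNumElts * PartIdx; e.g. with NewNumElts = 16, lane 37 of a
  // 64-element vector becomes lane 5 of piece 2, and for an insert the
  // pieces are then remerged to form the full-width result.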
if (!NarrowVecTy.isVector()) return UnableToLegalize; - Register DstReg = MI.getOperand(0).getReg(); - Register SrcVec = MI.getOperand(1).getReg(); - Register Idx = MI.getOperand(2).getReg(); LLT VecTy = MRI.getType(SrcVec); // If the index is a constant, we can really break this down as you would @@ -3637,8 +3643,8 @@ LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI, LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec); // Build a sequence of NarrowTy pieces in VecParts for this operand. - buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, - TargetOpcode::G_ANYEXT); + LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts, + TargetOpcode::G_ANYEXT); unsigned NewNumElts = NarrowVecTy.getNumElements(); @@ -3647,12 +3653,26 @@ LegalizerHelper::fewerElementsVectorExtractVectorElt(MachineInstr &MI, auto NewIdx = MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx); - MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + if (IsInsert) { + LLT PartTy = MRI.getType(VecParts[PartIdx]); + + // Use the adjusted index to insert into one of the subvectors. + auto InsertPart = MIRBuilder.buildInsertVectorElement( + PartTy, VecParts[PartIdx], InsertVal, NewIdx); + VecParts[PartIdx] = InsertPart.getReg(0); + + // Recombine the inserted subvector with the others to reform the result + // vector. + buildWidenedRemergeToDst(DstReg, LCMTy, VecParts); + } else { + MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx); + } + MI.eraseFromParent(); return Legalized; } - // With a variable index, we can't perform the extract in a smaller type, so + // With a variable index, we can't perform the operation in a smaller type, so // we're forced to expand this. // // TODO: We could emit a chain of compare/select to figure out which piece to @@ -3992,7 +4012,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_BUILD_VECTOR: return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy); case G_EXTRACT_VECTOR_ELT: - return fewerElementsVectorExtractVectorElt(MI, TypeIdx, NarrowTy); + case G_INSERT_VECTOR_ELT: + return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy); case G_LOAD: case G_STORE: return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 28fbc3ec59e710..4c3adb108031a5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1359,7 +1359,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(EltTypeIdx, S32, S64) .clampScalar(VecTypeIdx, S32, S64) .clampScalar(IdxTypeIdx, S32, S32) - .clampMaxNumElements(1, S32, 32) + .clampMaxNumElements(VecTypeIdx, S32, 32) // TODO: Clamp elements for 64-bit vectors? // It should only be necessary with variable indexes. 
// As a last resort, lower to the stack diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll new file mode 100644 index 00000000000000..5d1468eba04ea7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in, <64 x i32> addrspace(1)* %ptr.out) #0 { +; GCN-LABEL: v_insert_v64i32_37: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-NEXT: v_lshlrev_b64 v[0:1], 8, v[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_add_co_u32_e32 v8, vcc, v2, v0 +; GCN-NEXT: s_mov_b32 s1, 0 +; GCN-NEXT: v_addc_co_u32_e32 v9, vcc, v3, v1, vcc +; GCN-NEXT: s_movk_i32 s0, 0x80 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_add_co_u32_e32 v12, vcc, v8, v2 +; GCN-NEXT: s_movk_i32 s0, 0xc0 +; GCN-NEXT: v_mov_b32_e32 v65, s1 +; GCN-NEXT: v_mov_b32_e32 v5, s3 +; GCN-NEXT: v_mov_b32_e32 v64, s0 +; GCN-NEXT: s_movk_i32 s0, 0x50 +; GCN-NEXT: v_mov_b32_e32 v69, s1 +; GCN-NEXT: v_addc_co_u32_e32 v13, vcc, v9, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v4, s2 +; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v4, v0 +; GCN-NEXT: v_mov_b32_e32 v68, s0 +; GCN-NEXT: s_movk_i32 s0, 0x60 +; GCN-NEXT: v_mov_b32_e32 v71, s1 +; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v5, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v70, s0 +; GCN-NEXT: s_movk_i32 s0, 0x70 +; GCN-NEXT: v_mov_b32_e32 v73, s1 +; GCN-NEXT: v_add_co_u32_e32 v74, vcc, v66, v2 +; GCN-NEXT: v_mov_b32_e32 v72, s0 +; GCN-NEXT: s_movk_i32 s0, 0x90 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_addc_co_u32_e32 v75, vcc, v67, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_co_u32_e32 v76, vcc, v66, v0 +; GCN-NEXT: v_addc_co_u32_e32 v77, vcc, v67, v1, vcc +; GCN-NEXT: global_load_dwordx4 v[4:7], v[12:13], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v[12:13], off +; GCN-NEXT: v_add_co_u32_e32 v10, vcc, 64, v8 +; GCN-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v9, vcc +; GCN-NEXT: v_add_co_u32_e32 v28, vcc, v8, v64 +; GCN-NEXT: v_addc_co_u32_e32 v29, vcc, v9, v65, vcc +; GCN-NEXT: global_load_dwordx4 v[32:35], v[8:9], off +; GCN-NEXT: global_load_dwordx4 v[36:39], v[8:9], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[40:43], v[8:9], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[44:47], v[8:9], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[48:51], v[10:11], off +; GCN-NEXT: global_load_dwordx4 v[52:55], v[10:11], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[56:59], v[10:11], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[60:63], v[10:11], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[12:13], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[12:13], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v[28:29], off +; GCN-NEXT: global_load_dwordx4 v[20:23], v[28:29], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[28:29], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[28:29], off offset:48 +; GCN-NEXT: s_movk_i32 s0, 0xa0 +; GCN-NEXT: s_waitcnt vmcnt(15) +; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 +; GCN-NEXT: s_waitcnt vmcnt(14) +; GCN-NEXT: global_store_dwordx4 
v[74:75], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v[76:77], v[4:7], off +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0 +; GCN-NEXT: s_movk_i32 s0, 0xb0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[0:1], v[8:11], off +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[2:3], v[12:15], off +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v64 +; GCN-NEXT: s_movk_i32 s0, 0xd0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v65, vcc +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 +; GCN-NEXT: s_movk_i32 s0, 0xe0 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[0:1], v[16:19], off +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[2:3], v[20:23], off +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v0 +; GCN-NEXT: s_movk_i32 s0, 0xf0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v2 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v3, vcc +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[0:1], v[24:27], off +; GCN-NEXT: s_waitcnt vmcnt(7) +; GCN-NEXT: global_store_dwordx4 v[2:3], v[28:31], off +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v66 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v67, vcc +; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off offset:-48 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off offset:-32 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[44:47], off offset:-16 +; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v68 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v69, vcc +; GCN-NEXT: global_store_dwordx4 v[66:67], v[32:35], off +; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v66, v70 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v67, v71, vcc +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v66, v72 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v67, v73, vcc +; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off +; GCN-NEXT: global_store_dwordx4 v[2:3], v[60:63], off +; GCN-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id + %vec = load <64 x i32>, <64 x i32> addrspace(1)* %gep.in + %insert = insertelement <64 x i32> %vec, i32 999, i32 37 + %gep.out = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.out, i32 %id + store <64 x i32> %insert, <64 x i32> addrspace(1)* %gep.out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { "amdgpu-waves-per-eu"="1,10" } +attributes #1 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir index 3f3ec6216585b2..6d0d24e2373eaa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert-vector-elt.mir @@ -190,712 +190,94 @@ body: | ; CHECK-LABEL: name: 
insert_vector_elt_64_65_v64s32 ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1 - ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK: [[LOAD:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[COPY]](p1) :: (load 64, align 4, addrspace 4) - ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 - ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; CHECK: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 64 + 64, align 4, addrspace 4) - ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 128 - ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; CHECK: [[LOAD2:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 64 + 128, align 4, addrspace 4) - ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 192 - ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4) - ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12345 - ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1 - ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>) - ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>) - ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>) - ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>) - ; CHECK: G_STORE [[UV]](s32), [[FRAME_INDEX]](p5) :: (store 4 into %stack.1, align 256, addrspace 5) - ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C5]](s32) - ; CHECK: [[COPY1:%[0-9]+]]:_(p5) = COPY [[PTR_ADD3]](p5) - ; CHECK: G_STORE [[UV1]](s32), [[COPY1]](p5) :: (store 4 into %stack.1 + 4, align 256, addrspace 5) - ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s32) - ; CHECK: [[COPY2:%[0-9]+]]:_(p5) = COPY [[PTR_ADD4]](p5) - ; CHECK: G_STORE [[UV2]](s32), [[COPY2]](p5) :: (store 4 into %stack.1 + 8, align 256, addrspace 5) - ; CHECK: 
[[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C7]](s32) - ; CHECK: [[COPY3:%[0-9]+]]:_(p5) = COPY [[PTR_ADD5]](p5) - ; CHECK: G_STORE [[UV3]](s32), [[COPY3]](p5) :: (store 4 into %stack.1 + 12, align 256, addrspace 5) - ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s32) - ; CHECK: [[COPY4:%[0-9]+]]:_(p5) = COPY [[PTR_ADD6]](p5) - ; CHECK: G_STORE [[UV4]](s32), [[COPY4]](p5) :: (store 4 into %stack.1 + 16, align 256, addrspace 5) - ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 - ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C9]](s32) - ; CHECK: [[COPY5:%[0-9]+]]:_(p5) = COPY [[PTR_ADD7]](p5) - ; CHECK: G_STORE [[UV5]](s32), [[COPY5]](p5) :: (store 4 into %stack.1 + 20, align 256, addrspace 5) - ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s32) - ; CHECK: [[COPY6:%[0-9]+]]:_(p5) = COPY [[PTR_ADD8]](p5) - ; CHECK: G_STORE [[UV6]](s32), [[COPY6]](p5) :: (store 4 into %stack.1 + 24, align 256, addrspace 5) - ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 28 - ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C11]](s32) - ; CHECK: [[COPY7:%[0-9]+]]:_(p5) = COPY [[PTR_ADD9]](p5) - ; CHECK: G_STORE [[UV7]](s32), [[COPY7]](p5) :: (store 4 into %stack.1 + 28, align 256, addrspace 5) - ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C12]](s32) - ; CHECK: [[COPY8:%[0-9]+]]:_(p5) = COPY [[PTR_ADD10]](p5) - ; CHECK: G_STORE [[UV8]](s32), [[COPY8]](p5) :: (store 4 into %stack.1 + 32, align 256, addrspace 5) - ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 36 - ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C13]](s32) - ; CHECK: [[COPY9:%[0-9]+]]:_(p5) = COPY [[PTR_ADD11]](p5) - ; CHECK: G_STORE [[UV9]](s32), [[COPY9]](p5) :: (store 4 into %stack.1 + 36, align 256, addrspace 5) - ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 40 - ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C14]](s32) - ; CHECK: [[COPY10:%[0-9]+]]:_(p5) = COPY [[PTR_ADD12]](p5) - ; CHECK: G_STORE [[UV10]](s32), [[COPY10]](p5) :: (store 4 into %stack.1 + 40, align 256, addrspace 5) - ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 44 - ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C15]](s32) - ; CHECK: [[COPY11:%[0-9]+]]:_(p5) = COPY [[PTR_ADD13]](p5) - ; CHECK: G_STORE [[UV11]](s32), [[COPY11]](p5) :: (store 4 into %stack.1 + 44, align 256, addrspace 5) - ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 48 - ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C16]](s32) - ; CHECK: [[COPY12:%[0-9]+]]:_(p5) = COPY [[PTR_ADD14]](p5) - ; CHECK: G_STORE [[UV12]](s32), [[COPY12]](p5) :: (store 4 into %stack.1 + 48, align 256, addrspace 5) - ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 52 - ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C17]](s32) - ; CHECK: [[COPY13:%[0-9]+]]:_(p5) = COPY [[PTR_ADD15]](p5) - ; CHECK: G_STORE [[UV13]](s32), [[COPY13]](p5) :: (store 4 into %stack.1 + 52, align 256, addrspace 5) - ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 56 - ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C18]](s32) - ; CHECK: [[COPY14:%[0-9]+]]:_(p5) = COPY [[PTR_ADD16]](p5) - ; CHECK: G_STORE [[UV14]](s32), [[COPY14]](p5) :: (store 4 into %stack.1 + 56, align 256, 
addrspace 5) - ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 60 - ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C19]](s32) - ; CHECK: [[COPY15:%[0-9]+]]:_(p5) = COPY [[PTR_ADD17]](p5) - ; CHECK: G_STORE [[UV15]](s32), [[COPY15]](p5) :: (store 4 into %stack.1 + 60, align 256, addrspace 5) - ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C]](s32) - ; CHECK: [[COPY16:%[0-9]+]]:_(p5) = COPY [[PTR_ADD18]](p5) - ; CHECK: G_STORE [[UV16]](s32), [[COPY16]](p5) :: (store 4 into %stack.1 + 64, align 256, addrspace 5) - ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 68 - ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C20]](s32) - ; CHECK: [[COPY17:%[0-9]+]]:_(p5) = COPY [[PTR_ADD19]](p5) - ; CHECK: G_STORE [[UV17]](s32), [[COPY17]](p5) :: (store 4 into %stack.1 + 68, align 256, addrspace 5) - ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 72 - ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C21]](s32) - ; CHECK: [[COPY18:%[0-9]+]]:_(p5) = COPY [[PTR_ADD20]](p5) - ; CHECK: G_STORE [[UV18]](s32), [[COPY18]](p5) :: (store 4 into %stack.1 + 72, align 256, addrspace 5) - ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 76 - ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C22]](s32) - ; CHECK: [[COPY19:%[0-9]+]]:_(p5) = COPY [[PTR_ADD21]](p5) - ; CHECK: G_STORE [[UV19]](s32), [[COPY19]](p5) :: (store 4 into %stack.1 + 76, align 256, addrspace 5) - ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 80 - ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C23]](s32) - ; CHECK: [[COPY20:%[0-9]+]]:_(p5) = COPY [[PTR_ADD22]](p5) - ; CHECK: G_STORE [[UV20]](s32), [[COPY20]](p5) :: (store 4 into %stack.1 + 80, align 256, addrspace 5) - ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 84 - ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C24]](s32) - ; CHECK: [[COPY21:%[0-9]+]]:_(p5) = COPY [[PTR_ADD23]](p5) - ; CHECK: G_STORE [[UV21]](s32), [[COPY21]](p5) :: (store 4 into %stack.1 + 84, align 256, addrspace 5) - ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 88 - ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C25]](s32) - ; CHECK: [[COPY22:%[0-9]+]]:_(p5) = COPY [[PTR_ADD24]](p5) - ; CHECK: G_STORE [[UV22]](s32), [[COPY22]](p5) :: (store 4 into %stack.1 + 88, align 256, addrspace 5) - ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 92 - ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C26]](s32) - ; CHECK: [[COPY23:%[0-9]+]]:_(p5) = COPY [[PTR_ADD25]](p5) - ; CHECK: G_STORE [[UV23]](s32), [[COPY23]](p5) :: (store 4 into %stack.1 + 92, align 256, addrspace 5) - ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 96 - ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C27]](s32) - ; CHECK: [[COPY24:%[0-9]+]]:_(p5) = COPY [[PTR_ADD26]](p5) - ; CHECK: G_STORE [[UV24]](s32), [[COPY24]](p5) :: (store 4 into %stack.1 + 96, align 256, addrspace 5) - ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 - ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C28]](s32) - ; CHECK: [[COPY25:%[0-9]+]]:_(p5) = COPY [[PTR_ADD27]](p5) - ; CHECK: G_STORE [[UV25]](s32), [[COPY25]](p5) :: (store 4 into %stack.1 + 100, align 256, addrspace 5) - ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 104 - ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C29]](s32) - ; CHECK: [[COPY26:%[0-9]+]]:_(p5) = COPY [[PTR_ADD28]](p5) - ; CHECK: G_STORE [[UV26]](s32), [[COPY26]](p5) :: (store 4 into %stack.1 + 104, align 
256, addrspace 5) - ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 108 - ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C30]](s32) - ; CHECK: [[COPY27:%[0-9]+]]:_(p5) = COPY [[PTR_ADD29]](p5) - ; CHECK: G_STORE [[UV27]](s32), [[COPY27]](p5) :: (store 4 into %stack.1 + 108, align 256, addrspace 5) - ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 112 - ; CHECK: [[PTR_ADD30:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C31]](s32) - ; CHECK: [[COPY28:%[0-9]+]]:_(p5) = COPY [[PTR_ADD30]](p5) - ; CHECK: G_STORE [[UV28]](s32), [[COPY28]](p5) :: (store 4 into %stack.1 + 112, align 256, addrspace 5) - ; CHECK: [[C32:%[0-9]+]]:_(s32) = G_CONSTANT i32 116 - ; CHECK: [[PTR_ADD31:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C32]](s32) - ; CHECK: [[COPY29:%[0-9]+]]:_(p5) = COPY [[PTR_ADD31]](p5) - ; CHECK: G_STORE [[UV29]](s32), [[COPY29]](p5) :: (store 4 into %stack.1 + 116, align 256, addrspace 5) - ; CHECK: [[C33:%[0-9]+]]:_(s32) = G_CONSTANT i32 120 - ; CHECK: [[PTR_ADD32:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C33]](s32) - ; CHECK: [[COPY30:%[0-9]+]]:_(p5) = COPY [[PTR_ADD32]](p5) - ; CHECK: G_STORE [[UV30]](s32), [[COPY30]](p5) :: (store 4 into %stack.1 + 120, align 256, addrspace 5) - ; CHECK: [[C34:%[0-9]+]]:_(s32) = G_CONSTANT i32 124 - ; CHECK: [[PTR_ADD33:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C34]](s32) - ; CHECK: [[COPY31:%[0-9]+]]:_(p5) = COPY [[PTR_ADD33]](p5) - ; CHECK: G_STORE [[UV31]](s32), [[COPY31]](p5) :: (store 4 into %stack.1 + 124, align 256, addrspace 5) - ; CHECK: [[C35:%[0-9]+]]:_(s32) = G_CONSTANT i32 128 - ; CHECK: [[PTR_ADD34:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C35]](s32) - ; CHECK: [[COPY32:%[0-9]+]]:_(p5) = COPY [[PTR_ADD34]](p5) - ; CHECK: G_STORE [[UV32]](s32), [[COPY32]](p5) :: (store 4 into %stack.1 + 128, align 256, addrspace 5) - ; CHECK: [[C36:%[0-9]+]]:_(s32) = G_CONSTANT i32 132 - ; CHECK: [[PTR_ADD35:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C36]](s32) - ; CHECK: [[COPY33:%[0-9]+]]:_(p5) = COPY [[PTR_ADD35]](p5) - ; CHECK: G_STORE [[UV33]](s32), [[COPY33]](p5) :: (store 4 into %stack.1 + 132, align 256, addrspace 5) - ; CHECK: [[C37:%[0-9]+]]:_(s32) = G_CONSTANT i32 136 - ; CHECK: [[PTR_ADD36:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C37]](s32) - ; CHECK: [[COPY34:%[0-9]+]]:_(p5) = COPY [[PTR_ADD36]](p5) - ; CHECK: G_STORE [[UV34]](s32), [[COPY34]](p5) :: (store 4 into %stack.1 + 136, align 256, addrspace 5) - ; CHECK: [[C38:%[0-9]+]]:_(s32) = G_CONSTANT i32 140 - ; CHECK: [[PTR_ADD37:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C38]](s32) - ; CHECK: [[COPY35:%[0-9]+]]:_(p5) = COPY [[PTR_ADD37]](p5) - ; CHECK: G_STORE [[UV35]](s32), [[COPY35]](p5) :: (store 4 into %stack.1 + 140, align 256, addrspace 5) - ; CHECK: [[C39:%[0-9]+]]:_(s32) = G_CONSTANT i32 144 - ; CHECK: [[PTR_ADD38:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C39]](s32) - ; CHECK: [[COPY36:%[0-9]+]]:_(p5) = COPY [[PTR_ADD38]](p5) - ; CHECK: G_STORE [[UV36]](s32), [[COPY36]](p5) :: (store 4 into %stack.1 + 144, align 256, addrspace 5) - ; CHECK: [[C40:%[0-9]+]]:_(s32) = G_CONSTANT i32 148 - ; CHECK: [[PTR_ADD39:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C40]](s32) - ; CHECK: [[COPY37:%[0-9]+]]:_(p5) = COPY [[PTR_ADD39]](p5) - ; CHECK: G_STORE [[UV37]](s32), [[COPY37]](p5) :: (store 4 into %stack.1 + 148, align 256, addrspace 5) - ; CHECK: [[C41:%[0-9]+]]:_(s32) = G_CONSTANT i32 152 - ; CHECK: [[PTR_ADD40:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C41]](s32) - ; CHECK: [[COPY38:%[0-9]+]]:_(p5) = COPY [[PTR_ADD40]](p5) - ; 
CHECK: G_STORE [[UV38]](s32), [[COPY38]](p5) :: (store 4 into %stack.1 + 152, align 256, addrspace 5) - ; CHECK: [[C42:%[0-9]+]]:_(s32) = G_CONSTANT i32 156 - ; CHECK: [[PTR_ADD41:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C42]](s32) - ; CHECK: [[COPY39:%[0-9]+]]:_(p5) = COPY [[PTR_ADD41]](p5) - ; CHECK: G_STORE [[UV39]](s32), [[COPY39]](p5) :: (store 4 into %stack.1 + 156, align 256, addrspace 5) - ; CHECK: [[C43:%[0-9]+]]:_(s32) = G_CONSTANT i32 160 - ; CHECK: [[PTR_ADD42:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C43]](s32) - ; CHECK: [[COPY40:%[0-9]+]]:_(p5) = COPY [[PTR_ADD42]](p5) - ; CHECK: G_STORE [[UV40]](s32), [[COPY40]](p5) :: (store 4 into %stack.1 + 160, align 256, addrspace 5) - ; CHECK: [[C44:%[0-9]+]]:_(s32) = G_CONSTANT i32 164 - ; CHECK: [[PTR_ADD43:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C44]](s32) - ; CHECK: [[COPY41:%[0-9]+]]:_(p5) = COPY [[PTR_ADD43]](p5) - ; CHECK: G_STORE [[UV41]](s32), [[COPY41]](p5) :: (store 4 into %stack.1 + 164, align 256, addrspace 5) - ; CHECK: [[C45:%[0-9]+]]:_(s32) = G_CONSTANT i32 168 - ; CHECK: [[PTR_ADD44:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C45]](s32) - ; CHECK: [[COPY42:%[0-9]+]]:_(p5) = COPY [[PTR_ADD44]](p5) - ; CHECK: G_STORE [[UV42]](s32), [[COPY42]](p5) :: (store 4 into %stack.1 + 168, align 256, addrspace 5) - ; CHECK: [[C46:%[0-9]+]]:_(s32) = G_CONSTANT i32 172 - ; CHECK: [[PTR_ADD45:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C46]](s32) - ; CHECK: [[COPY43:%[0-9]+]]:_(p5) = COPY [[PTR_ADD45]](p5) - ; CHECK: G_STORE [[UV43]](s32), [[COPY43]](p5) :: (store 4 into %stack.1 + 172, align 256, addrspace 5) - ; CHECK: [[C47:%[0-9]+]]:_(s32) = G_CONSTANT i32 176 - ; CHECK: [[PTR_ADD46:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C47]](s32) - ; CHECK: [[COPY44:%[0-9]+]]:_(p5) = COPY [[PTR_ADD46]](p5) - ; CHECK: G_STORE [[UV44]](s32), [[COPY44]](p5) :: (store 4 into %stack.1 + 176, align 256, addrspace 5) - ; CHECK: [[C48:%[0-9]+]]:_(s32) = G_CONSTANT i32 180 - ; CHECK: [[PTR_ADD47:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C48]](s32) - ; CHECK: [[COPY45:%[0-9]+]]:_(p5) = COPY [[PTR_ADD47]](p5) - ; CHECK: G_STORE [[UV45]](s32), [[COPY45]](p5) :: (store 4 into %stack.1 + 180, align 256, addrspace 5) - ; CHECK: [[C49:%[0-9]+]]:_(s32) = G_CONSTANT i32 184 - ; CHECK: [[PTR_ADD48:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C49]](s32) - ; CHECK: [[COPY46:%[0-9]+]]:_(p5) = COPY [[PTR_ADD48]](p5) - ; CHECK: G_STORE [[UV46]](s32), [[COPY46]](p5) :: (store 4 into %stack.1 + 184, align 256, addrspace 5) - ; CHECK: [[C50:%[0-9]+]]:_(s32) = G_CONSTANT i32 188 - ; CHECK: [[PTR_ADD49:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C50]](s32) - ; CHECK: [[COPY47:%[0-9]+]]:_(p5) = COPY [[PTR_ADD49]](p5) - ; CHECK: G_STORE [[UV47]](s32), [[COPY47]](p5) :: (store 4 into %stack.1 + 188, align 256, addrspace 5) - ; CHECK: [[C51:%[0-9]+]]:_(s32) = G_CONSTANT i32 192 - ; CHECK: [[PTR_ADD50:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C51]](s32) - ; CHECK: [[COPY48:%[0-9]+]]:_(p5) = COPY [[PTR_ADD50]](p5) - ; CHECK: G_STORE [[UV48]](s32), [[COPY48]](p5) :: (store 4 into %stack.1 + 192, align 256, addrspace 5) - ; CHECK: [[C52:%[0-9]+]]:_(s32) = G_CONSTANT i32 196 - ; CHECK: [[PTR_ADD51:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C52]](s32) - ; CHECK: [[COPY49:%[0-9]+]]:_(p5) = COPY [[PTR_ADD51]](p5) - ; CHECK: G_STORE [[UV49]](s32), [[COPY49]](p5) :: (store 4 into %stack.1 + 196, align 256, addrspace 5) - ; CHECK: [[C53:%[0-9]+]]:_(s32) = G_CONSTANT i32 200 - ; CHECK: [[PTR_ADD52:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[FRAME_INDEX]], [[C53]](s32) - ; CHECK: [[COPY50:%[0-9]+]]:_(p5) = COPY [[PTR_ADD52]](p5) - ; CHECK: G_STORE [[UV50]](s32), [[COPY50]](p5) :: (store 4 into %stack.1 + 200, align 256, addrspace 5) - ; CHECK: [[C54:%[0-9]+]]:_(s32) = G_CONSTANT i32 204 - ; CHECK: [[PTR_ADD53:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C54]](s32) - ; CHECK: [[COPY51:%[0-9]+]]:_(p5) = COPY [[PTR_ADD53]](p5) - ; CHECK: G_STORE [[UV51]](s32), [[COPY51]](p5) :: (store 4 into %stack.1 + 204, align 256, addrspace 5) - ; CHECK: [[C55:%[0-9]+]]:_(s32) = G_CONSTANT i32 208 - ; CHECK: [[PTR_ADD54:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C55]](s32) - ; CHECK: [[COPY52:%[0-9]+]]:_(p5) = COPY [[PTR_ADD54]](p5) - ; CHECK: G_STORE [[UV52]](s32), [[COPY52]](p5) :: (store 4 into %stack.1 + 208, align 256, addrspace 5) - ; CHECK: [[C56:%[0-9]+]]:_(s32) = G_CONSTANT i32 212 - ; CHECK: [[PTR_ADD55:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C56]](s32) - ; CHECK: [[COPY53:%[0-9]+]]:_(p5) = COPY [[PTR_ADD55]](p5) - ; CHECK: G_STORE [[UV53]](s32), [[COPY53]](p5) :: (store 4 into %stack.1 + 212, align 256, addrspace 5) - ; CHECK: [[C57:%[0-9]+]]:_(s32) = G_CONSTANT i32 216 - ; CHECK: [[PTR_ADD56:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C57]](s32) - ; CHECK: [[COPY54:%[0-9]+]]:_(p5) = COPY [[PTR_ADD56]](p5) - ; CHECK: G_STORE [[UV54]](s32), [[COPY54]](p5) :: (store 4 into %stack.1 + 216, align 256, addrspace 5) - ; CHECK: [[C58:%[0-9]+]]:_(s32) = G_CONSTANT i32 220 - ; CHECK: [[PTR_ADD57:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C58]](s32) - ; CHECK: [[COPY55:%[0-9]+]]:_(p5) = COPY [[PTR_ADD57]](p5) - ; CHECK: G_STORE [[UV55]](s32), [[COPY55]](p5) :: (store 4 into %stack.1 + 220, align 256, addrspace 5) - ; CHECK: [[C59:%[0-9]+]]:_(s32) = G_CONSTANT i32 224 - ; CHECK: [[PTR_ADD58:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C59]](s32) - ; CHECK: [[COPY56:%[0-9]+]]:_(p5) = COPY [[PTR_ADD58]](p5) - ; CHECK: G_STORE [[UV56]](s32), [[COPY56]](p5) :: (store 4 into %stack.1 + 224, align 256, addrspace 5) - ; CHECK: [[C60:%[0-9]+]]:_(s32) = G_CONSTANT i32 228 - ; CHECK: [[PTR_ADD59:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C60]](s32) - ; CHECK: [[COPY57:%[0-9]+]]:_(p5) = COPY [[PTR_ADD59]](p5) - ; CHECK: G_STORE [[UV57]](s32), [[COPY57]](p5) :: (store 4 into %stack.1 + 228, align 256, addrspace 5) - ; CHECK: [[C61:%[0-9]+]]:_(s32) = G_CONSTANT i32 232 - ; CHECK: [[PTR_ADD60:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C61]](s32) - ; CHECK: [[COPY58:%[0-9]+]]:_(p5) = COPY [[PTR_ADD60]](p5) - ; CHECK: G_STORE [[UV58]](s32), [[COPY58]](p5) :: (store 4 into %stack.1 + 232, align 256, addrspace 5) - ; CHECK: [[C62:%[0-9]+]]:_(s32) = G_CONSTANT i32 236 - ; CHECK: [[PTR_ADD61:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C62]](s32) - ; CHECK: [[COPY59:%[0-9]+]]:_(p5) = COPY [[PTR_ADD61]](p5) - ; CHECK: G_STORE [[UV59]](s32), [[COPY59]](p5) :: (store 4 into %stack.1 + 236, align 256, addrspace 5) - ; CHECK: [[C63:%[0-9]+]]:_(s32) = G_CONSTANT i32 240 - ; CHECK: [[PTR_ADD62:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C63]](s32) - ; CHECK: [[COPY60:%[0-9]+]]:_(p5) = COPY [[PTR_ADD62]](p5) - ; CHECK: G_STORE [[UV60]](s32), [[COPY60]](p5) :: (store 4 into %stack.1 + 240, align 256, addrspace 5) - ; CHECK: [[C64:%[0-9]+]]:_(s32) = G_CONSTANT i32 244 - ; CHECK: [[PTR_ADD63:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C64]](s32) - ; CHECK: [[COPY61:%[0-9]+]]:_(p5) = COPY [[PTR_ADD63]](p5) - ; CHECK: G_STORE [[UV61]](s32), [[COPY61]](p5) :: (store 4 into %stack.1 + 244, align 256, addrspace 5) - ; CHECK: 
[[C65:%[0-9]+]]:_(s32) = G_CONSTANT i32 248 - ; CHECK: [[PTR_ADD64:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C65]](s32) - ; CHECK: [[COPY62:%[0-9]+]]:_(p5) = COPY [[PTR_ADD64]](p5) - ; CHECK: G_STORE [[UV62]](s32), [[COPY62]](p5) :: (store 4 into %stack.1 + 248, align 256, addrspace 5) - ; CHECK: [[C66:%[0-9]+]]:_(s32) = G_CONSTANT i32 252 - ; CHECK: [[PTR_ADD65:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C66]](s32) - ; CHECK: [[COPY63:%[0-9]+]]:_(p5) = COPY [[PTR_ADD65]](p5) - ; CHECK: G_STORE [[UV63]](s32), [[COPY63]](p5) :: (store 4 into %stack.1 + 252, align 256, addrspace 5) - ; CHECK: [[C67:%[0-9]+]]:_(s32) = G_CONSTANT i32 256 - ; CHECK: [[PTR_ADD66:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C67]](s32) - ; CHECK: G_STORE [[C4]](s32), [[PTR_ADD66]](p5) :: (store 4 into %stack.1 + 256, align 256, addrspace 5) - ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (load 4 from %stack.1 + 256, align 256, addrspace 5) - ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 from %stack.1 + 260, align 256, addrspace 5) - ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 from %stack.1 + 264, align 256, addrspace 5) - ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 from %stack.1 + 268, align 256, addrspace 5) - ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 from %stack.1 + 272, align 256, addrspace 5) - ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p5) :: (load 4 from %stack.1 + 276, align 256, addrspace 5) - ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load 4 from %stack.1 + 280, align 256, addrspace 5) - ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p5) :: (load 4 from %stack.1 + 284, align 256, addrspace 5) - ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load 4 from %stack.1 + 288, align 256, addrspace 5) - ; CHECK: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p5) :: (load 4 from %stack.1 + 292, align 256, addrspace 5) - ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p5) :: (load 4 from %stack.1 + 296, align 256, addrspace 5) - ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p5) :: (load 4 from %stack.1 + 300, align 256, addrspace 5) - ; CHECK: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load 4 from %stack.1 + 304, align 256, addrspace 5) - ; CHECK: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD15]](p5) :: (load 4 from %stack.1 + 308, align 256, addrspace 5) - ; CHECK: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD16]](p5) :: (load 4 from %stack.1 + 312, align 256, addrspace 5) - ; CHECK: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD17]](p5) :: (load 4 from %stack.1 + 316, align 256, addrspace 5) - ; CHECK: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load 4 from %stack.1 + 320, align 256, addrspace 5) - ; CHECK: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD19]](p5) :: (load 4 from %stack.1 + 324, align 256, addrspace 5) - ; CHECK: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD20]](p5) :: (load 4 from %stack.1 + 328, align 256, addrspace 5) - ; CHECK: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD21]](p5) :: (load 4 from %stack.1 + 332, align 256, addrspace 5) - ; CHECK: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load 4 from %stack.1 + 336, align 256, addrspace 5) - ; CHECK: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD23]](p5) :: (load 4 from %stack.1 + 340, align 256, addrspace 5) - ; CHECK: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD24]](p5) :: (load 4 from 
%stack.1 + 344, align 256, addrspace 5) - ; CHECK: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD25]](p5) :: (load 4 from %stack.1 + 348, align 256, addrspace 5) - ; CHECK: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p5) :: (load 4 from %stack.1 + 352, align 256, addrspace 5) - ; CHECK: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD27]](p5) :: (load 4 from %stack.1 + 356, align 256, addrspace 5) - ; CHECK: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD28]](p5) :: (load 4 from %stack.1 + 360, align 256, addrspace 5) - ; CHECK: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD29]](p5) :: (load 4 from %stack.1 + 364, align 256, addrspace 5) - ; CHECK: [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p5) :: (load 4 from %stack.1 + 368, align 256, addrspace 5) - ; CHECK: [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD31]](p5) :: (load 4 from %stack.1 + 372, align 256, addrspace 5) - ; CHECK: [[LOAD34:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD32]](p5) :: (load 4 from %stack.1 + 376, align 256, addrspace 5) - ; CHECK: [[LOAD35:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD33]](p5) :: (load 4 from %stack.1 + 380, align 256, addrspace 5) - ; CHECK: [[LOAD36:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD34]](p5) :: (load 4 from %stack.1 + 384, align 256, addrspace 5) - ; CHECK: [[LOAD37:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD35]](p5) :: (load 4 from %stack.1 + 388, align 256, addrspace 5) - ; CHECK: [[LOAD38:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD36]](p5) :: (load 4 from %stack.1 + 392, align 256, addrspace 5) - ; CHECK: [[LOAD39:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD37]](p5) :: (load 4 from %stack.1 + 396, align 256, addrspace 5) - ; CHECK: [[LOAD40:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD38]](p5) :: (load 4 from %stack.1 + 400, align 256, addrspace 5) - ; CHECK: [[LOAD41:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD39]](p5) :: (load 4 from %stack.1 + 404, align 256, addrspace 5) - ; CHECK: [[LOAD42:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD40]](p5) :: (load 4 from %stack.1 + 408, align 256, addrspace 5) - ; CHECK: [[LOAD43:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD41]](p5) :: (load 4 from %stack.1 + 412, align 256, addrspace 5) - ; CHECK: [[LOAD44:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD42]](p5) :: (load 4 from %stack.1 + 416, align 256, addrspace 5) - ; CHECK: [[LOAD45:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD43]](p5) :: (load 4 from %stack.1 + 420, align 256, addrspace 5) - ; CHECK: [[LOAD46:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD44]](p5) :: (load 4 from %stack.1 + 424, align 256, addrspace 5) - ; CHECK: [[LOAD47:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD45]](p5) :: (load 4 from %stack.1 + 428, align 256, addrspace 5) - ; CHECK: [[LOAD48:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD46]](p5) :: (load 4 from %stack.1 + 432, align 256, addrspace 5) - ; CHECK: [[LOAD49:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD47]](p5) :: (load 4 from %stack.1 + 436, align 256, addrspace 5) - ; CHECK: [[LOAD50:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD48]](p5) :: (load 4 from %stack.1 + 440, align 256, addrspace 5) - ; CHECK: [[LOAD51:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD49]](p5) :: (load 4 from %stack.1 + 444, align 256, addrspace 5) - ; CHECK: [[LOAD52:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD50]](p5) :: (load 4 from %stack.1 + 448, align 256, addrspace 5) - ; CHECK: [[LOAD53:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD51]](p5) :: (load 4 from %stack.1 + 452, align 256, addrspace 5) - ; CHECK: [[LOAD54:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD52]](p5) :: (load 4 from %stack.1 + 456, align 256, addrspace 5) - ; CHECK: [[LOAD55:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD53]](p5) :: (load 4 from %stack.1 + 460, align 256, addrspace 5) - ; CHECK: 
[[LOAD56:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD54]](p5) :: (load 4 from %stack.1 + 464, align 256, addrspace 5) - ; CHECK: [[LOAD57:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD55]](p5) :: (load 4 from %stack.1 + 468, align 256, addrspace 5) - ; CHECK: [[LOAD58:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD56]](p5) :: (load 4 from %stack.1 + 472, align 256, addrspace 5) - ; CHECK: [[LOAD59:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD57]](p5) :: (load 4 from %stack.1 + 476, align 256, addrspace 5) - ; CHECK: [[LOAD60:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD58]](p5) :: (load 4 from %stack.1 + 480, align 256, addrspace 5) - ; CHECK: [[LOAD61:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD59]](p5) :: (load 4 from %stack.1 + 484, align 256, addrspace 5) - ; CHECK: [[LOAD62:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD60]](p5) :: (load 4 from %stack.1 + 488, align 256, addrspace 5) - ; CHECK: [[LOAD63:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD61]](p5) :: (load 4 from %stack.1 + 492, align 256, addrspace 5) - ; CHECK: [[LOAD64:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD62]](p5) :: (load 4 from %stack.1 + 496, align 256, addrspace 5) - ; CHECK: [[LOAD65:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD63]](p5) :: (load 4 from %stack.1 + 500, align 256, addrspace 5) - ; CHECK: [[LOAD66:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD64]](p5) :: (load 4 from %stack.1 + 504, align 256, addrspace 5) - ; CHECK: [[LOAD67:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD65]](p5) :: (load 4 from %stack.1 + 508, align 256, addrspace 5) - ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0 - ; CHECK: [[UV64:%[0-9]+]]:_(s32), [[UV65:%[0-9]+]]:_(s32), [[UV66:%[0-9]+]]:_(s32), [[UV67:%[0-9]+]]:_(s32), [[UV68:%[0-9]+]]:_(s32), [[UV69:%[0-9]+]]:_(s32), [[UV70:%[0-9]+]]:_(s32), [[UV71:%[0-9]+]]:_(s32), [[UV72:%[0-9]+]]:_(s32), [[UV73:%[0-9]+]]:_(s32), [[UV74:%[0-9]+]]:_(s32), [[UV75:%[0-9]+]]:_(s32), [[UV76:%[0-9]+]]:_(s32), [[UV77:%[0-9]+]]:_(s32), [[UV78:%[0-9]+]]:_(s32), [[UV79:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>) - ; CHECK: [[UV80:%[0-9]+]]:_(s32), [[UV81:%[0-9]+]]:_(s32), [[UV82:%[0-9]+]]:_(s32), [[UV83:%[0-9]+]]:_(s32), [[UV84:%[0-9]+]]:_(s32), [[UV85:%[0-9]+]]:_(s32), [[UV86:%[0-9]+]]:_(s32), [[UV87:%[0-9]+]]:_(s32), [[UV88:%[0-9]+]]:_(s32), [[UV89:%[0-9]+]]:_(s32), [[UV90:%[0-9]+]]:_(s32), [[UV91:%[0-9]+]]:_(s32), [[UV92:%[0-9]+]]:_(s32), [[UV93:%[0-9]+]]:_(s32), [[UV94:%[0-9]+]]:_(s32), [[UV95:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>) - ; CHECK: [[UV96:%[0-9]+]]:_(s32), [[UV97:%[0-9]+]]:_(s32), [[UV98:%[0-9]+]]:_(s32), [[UV99:%[0-9]+]]:_(s32), [[UV100:%[0-9]+]]:_(s32), [[UV101:%[0-9]+]]:_(s32), [[UV102:%[0-9]+]]:_(s32), [[UV103:%[0-9]+]]:_(s32), [[UV104:%[0-9]+]]:_(s32), [[UV105:%[0-9]+]]:_(s32), [[UV106:%[0-9]+]]:_(s32), [[UV107:%[0-9]+]]:_(s32), [[UV108:%[0-9]+]]:_(s32), [[UV109:%[0-9]+]]:_(s32), [[UV110:%[0-9]+]]:_(s32), [[UV111:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>) - ; CHECK: [[UV112:%[0-9]+]]:_(s32), [[UV113:%[0-9]+]]:_(s32), [[UV114:%[0-9]+]]:_(s32), [[UV115:%[0-9]+]]:_(s32), [[UV116:%[0-9]+]]:_(s32), [[UV117:%[0-9]+]]:_(s32), [[UV118:%[0-9]+]]:_(s32), [[UV119:%[0-9]+]]:_(s32), [[UV120:%[0-9]+]]:_(s32), [[UV121:%[0-9]+]]:_(s32), [[UV122:%[0-9]+]]:_(s32), [[UV123:%[0-9]+]]:_(s32), [[UV124:%[0-9]+]]:_(s32), [[UV125:%[0-9]+]]:_(s32), [[UV126:%[0-9]+]]:_(s32), [[UV127:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>) - ; CHECK: G_STORE [[UV64]](s32), [[FRAME_INDEX1]](p5) :: (store 4 into %stack.0, align 256, addrspace 5) - ; CHECK: [[PTR_ADD67:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C5]](s32) - ; CHECK: [[COPY64:%[0-9]+]]:_(p5) = 
COPY [[PTR_ADD67]](p5)
-    ; CHECK: G_STORE [[UV65]](s32), [[COPY64]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD68:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C6]](s32)
-    ; CHECK: [[COPY65:%[0-9]+]]:_(p5) = COPY [[PTR_ADD68]](p5)
-    ; CHECK: G_STORE [[UV66]](s32), [[COPY65]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD69:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C7]](s32)
-    ; CHECK: [[COPY66:%[0-9]+]]:_(p5) = COPY [[PTR_ADD69]](p5)
-    ; CHECK: G_STORE [[UV67]](s32), [[COPY66]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD70:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C8]](s32)
-    ; CHECK: [[COPY67:%[0-9]+]]:_(p5) = COPY [[PTR_ADD70]](p5)
-    ; CHECK: G_STORE [[UV68]](s32), [[COPY67]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD71:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C9]](s32)
-    ; CHECK: [[COPY68:%[0-9]+]]:_(p5) = COPY [[PTR_ADD71]](p5)
-    ; CHECK: G_STORE [[UV69]](s32), [[COPY68]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD72:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C10]](s32)
-    ; CHECK: [[COPY69:%[0-9]+]]:_(p5) = COPY [[PTR_ADD72]](p5)
-    ; CHECK: G_STORE [[UV70]](s32), [[COPY69]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD73:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C11]](s32)
-    ; CHECK: [[COPY70:%[0-9]+]]:_(p5) = COPY [[PTR_ADD73]](p5)
-    ; CHECK: G_STORE [[UV71]](s32), [[COPY70]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD74:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C12]](s32)
-    ; CHECK: [[COPY71:%[0-9]+]]:_(p5) = COPY [[PTR_ADD74]](p5)
-    ; CHECK: G_STORE [[UV72]](s32), [[COPY71]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD75:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C13]](s32)
-    ; CHECK: [[COPY72:%[0-9]+]]:_(p5) = COPY [[PTR_ADD75]](p5)
-    ; CHECK: G_STORE [[UV73]](s32), [[COPY72]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD76:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C14]](s32)
-    ; CHECK: [[COPY73:%[0-9]+]]:_(p5) = COPY [[PTR_ADD76]](p5)
-    ; CHECK: G_STORE [[UV74]](s32), [[COPY73]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD77:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C15]](s32)
-    ; CHECK: [[COPY74:%[0-9]+]]:_(p5) = COPY [[PTR_ADD77]](p5)
-    ; CHECK: G_STORE [[UV75]](s32), [[COPY74]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD78:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C16]](s32)
-    ; CHECK: [[COPY75:%[0-9]+]]:_(p5) = COPY [[PTR_ADD78]](p5)
-    ; CHECK: G_STORE [[UV76]](s32), [[COPY75]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD79:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C17]](s32)
-    ; CHECK: [[COPY76:%[0-9]+]]:_(p5) = COPY [[PTR_ADD79]](p5)
-    ; CHECK: G_STORE [[UV77]](s32), [[COPY76]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD80:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C18]](s32)
-    ; CHECK: [[COPY77:%[0-9]+]]:_(p5) = COPY [[PTR_ADD80]](p5)
-    ; CHECK: G_STORE [[UV78]](s32), [[COPY77]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD81:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C19]](s32)
-    ; CHECK: [[COPY78:%[0-9]+]]:_(p5) = COPY [[PTR_ADD81]](p5)
-    ; CHECK: G_STORE [[UV79]](s32), [[COPY78]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD82:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C]](s32)
-    ; CHECK: [[COPY79:%[0-9]+]]:_(p5) = COPY [[PTR_ADD82]](p5)
-    ; CHECK: G_STORE [[UV80]](s32), [[COPY79]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD83:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C20]](s32)
-    ; CHECK: [[COPY80:%[0-9]+]]:_(p5) = COPY [[PTR_ADD83]](p5)
-    ; CHECK: G_STORE [[UV81]](s32), [[COPY80]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD84:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C21]](s32)
-    ; CHECK: [[COPY81:%[0-9]+]]:_(p5) = COPY [[PTR_ADD84]](p5)
-    ; CHECK: G_STORE [[UV82]](s32), [[COPY81]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD85:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C22]](s32)
-    ; CHECK: [[COPY82:%[0-9]+]]:_(p5) = COPY [[PTR_ADD85]](p5)
-    ; CHECK: G_STORE [[UV83]](s32), [[COPY82]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD86:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C23]](s32)
-    ; CHECK: [[COPY83:%[0-9]+]]:_(p5) = COPY [[PTR_ADD86]](p5)
-    ; CHECK: G_STORE [[UV84]](s32), [[COPY83]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD87:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C24]](s32)
-    ; CHECK: [[COPY84:%[0-9]+]]:_(p5) = COPY [[PTR_ADD87]](p5)
-    ; CHECK: G_STORE [[UV85]](s32), [[COPY84]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD88:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C25]](s32)
-    ; CHECK: [[COPY85:%[0-9]+]]:_(p5) = COPY [[PTR_ADD88]](p5)
-    ; CHECK: G_STORE [[UV86]](s32), [[COPY85]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD89:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C26]](s32)
-    ; CHECK: [[COPY86:%[0-9]+]]:_(p5) = COPY [[PTR_ADD89]](p5)
-    ; CHECK: G_STORE [[UV87]](s32), [[COPY86]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD90:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C27]](s32)
-    ; CHECK: [[COPY87:%[0-9]+]]:_(p5) = COPY [[PTR_ADD90]](p5)
-    ; CHECK: G_STORE [[UV88]](s32), [[COPY87]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD91:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C28]](s32)
-    ; CHECK: [[COPY88:%[0-9]+]]:_(p5) = COPY [[PTR_ADD91]](p5)
-    ; CHECK: G_STORE [[UV89]](s32), [[COPY88]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD92:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C29]](s32)
-    ; CHECK: [[COPY89:%[0-9]+]]:_(p5) = COPY [[PTR_ADD92]](p5)
-    ; CHECK: G_STORE [[UV90]](s32), [[COPY89]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD93:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C30]](s32)
-    ; CHECK: [[COPY90:%[0-9]+]]:_(p5) = COPY [[PTR_ADD93]](p5)
-    ; CHECK: G_STORE [[UV91]](s32), [[COPY90]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD94:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C31]](s32)
-    ; CHECK: [[COPY91:%[0-9]+]]:_(p5) = COPY [[PTR_ADD94]](p5)
-    ; CHECK: G_STORE [[UV92]](s32), [[COPY91]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD95:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C32]](s32)
-    ; CHECK: [[COPY92:%[0-9]+]]:_(p5) = COPY [[PTR_ADD95]](p5)
-    ; CHECK: G_STORE [[UV93]](s32), [[COPY92]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD96:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C33]](s32)
-    ; CHECK: [[COPY93:%[0-9]+]]:_(p5) = COPY [[PTR_ADD96]](p5)
-    ; CHECK: G_STORE [[UV94]](s32), [[COPY93]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD97:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C34]](s32)
-    ; CHECK: [[COPY94:%[0-9]+]]:_(p5) = COPY [[PTR_ADD97]](p5)
-    ; CHECK: G_STORE [[UV95]](s32), [[COPY94]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD98:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C35]](s32)
-    ; CHECK: [[COPY95:%[0-9]+]]:_(p5) = COPY [[PTR_ADD98]](p5)
-    ; CHECK: G_STORE [[UV96]](s32), [[COPY95]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD99:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C36]](s32)
-    ; CHECK: [[COPY96:%[0-9]+]]:_(p5) = COPY [[PTR_ADD99]](p5)
-    ; CHECK: G_STORE [[UV97]](s32), [[COPY96]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD100:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C37]](s32)
-    ; CHECK: [[COPY97:%[0-9]+]]:_(p5) = COPY [[PTR_ADD100]](p5)
-    ; CHECK: G_STORE [[UV98]](s32), [[COPY97]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD101:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C38]](s32)
-    ; CHECK: [[COPY98:%[0-9]+]]:_(p5) = COPY [[PTR_ADD101]](p5)
-    ; CHECK: G_STORE [[UV99]](s32), [[COPY98]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD102:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C39]](s32)
-    ; CHECK: [[COPY99:%[0-9]+]]:_(p5) = COPY [[PTR_ADD102]](p5)
-    ; CHECK: G_STORE [[UV100]](s32), [[COPY99]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD103:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C40]](s32)
-    ; CHECK: [[COPY100:%[0-9]+]]:_(p5) = COPY [[PTR_ADD103]](p5)
-    ; CHECK: G_STORE [[UV101]](s32), [[COPY100]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD104:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C41]](s32)
-    ; CHECK: [[COPY101:%[0-9]+]]:_(p5) = COPY [[PTR_ADD104]](p5)
-    ; CHECK: G_STORE [[UV102]](s32), [[COPY101]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD105:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C42]](s32)
-    ; CHECK: [[COPY102:%[0-9]+]]:_(p5) = COPY [[PTR_ADD105]](p5)
-    ; CHECK: G_STORE [[UV103]](s32), [[COPY102]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD106:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C43]](s32)
-    ; CHECK: [[COPY103:%[0-9]+]]:_(p5) = COPY [[PTR_ADD106]](p5)
-    ; CHECK: G_STORE [[UV104]](s32), [[COPY103]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD107:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C44]](s32)
-    ; CHECK: [[COPY104:%[0-9]+]]:_(p5) = COPY [[PTR_ADD107]](p5)
-    ; CHECK: G_STORE [[UV105]](s32), [[COPY104]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD108:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C45]](s32)
-    ; CHECK: [[COPY105:%[0-9]+]]:_(p5) = COPY [[PTR_ADD108]](p5)
-    ; CHECK: G_STORE [[UV106]](s32), [[COPY105]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD109:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C46]](s32)
-    ; CHECK: [[COPY106:%[0-9]+]]:_(p5) = COPY [[PTR_ADD109]](p5)
-    ; CHECK: G_STORE [[UV107]](s32), [[COPY106]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD110:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C47]](s32)
-    ; CHECK: [[COPY107:%[0-9]+]]:_(p5) = COPY [[PTR_ADD110]](p5)
-    ; CHECK: G_STORE [[UV108]](s32), [[COPY107]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD111:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C48]](s32)
-    ; CHECK: [[COPY108:%[0-9]+]]:_(p5) = COPY [[PTR_ADD111]](p5)
-    ; CHECK: G_STORE [[UV109]](s32), [[COPY108]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD112:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C49]](s32)
-    ; CHECK: [[COPY109:%[0-9]+]]:_(p5) = COPY [[PTR_ADD112]](p5)
-    ; CHECK: G_STORE [[UV110]](s32), [[COPY109]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD113:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C50]](s32)
-    ; CHECK: [[COPY110:%[0-9]+]]:_(p5) = COPY [[PTR_ADD113]](p5)
-    ; CHECK: G_STORE [[UV111]](s32), [[COPY110]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD114:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C51]](s32)
-    ; CHECK: [[COPY111:%[0-9]+]]:_(p5) = COPY [[PTR_ADD114]](p5)
-    ; CHECK: G_STORE [[UV112]](s32), [[COPY111]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD115:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C52]](s32)
-    ; CHECK: [[COPY112:%[0-9]+]]:_(p5) = COPY [[PTR_ADD115]](p5)
-    ; CHECK: G_STORE [[UV113]](s32), [[COPY112]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD116:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C53]](s32)
-    ; CHECK: [[COPY113:%[0-9]+]]:_(p5) = COPY [[PTR_ADD116]](p5)
-    ; CHECK: G_STORE [[UV114]](s32), [[COPY113]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD117:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C54]](s32)
-    ; CHECK: [[COPY114:%[0-9]+]]:_(p5) = COPY [[PTR_ADD117]](p5)
-    ; CHECK: G_STORE [[UV115]](s32), [[COPY114]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD118:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C55]](s32)
-    ; CHECK: [[COPY115:%[0-9]+]]:_(p5) = COPY [[PTR_ADD118]](p5)
-    ; CHECK: G_STORE [[UV116]](s32), [[COPY115]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD119:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C56]](s32)
-    ; CHECK: [[COPY116:%[0-9]+]]:_(p5) = COPY [[PTR_ADD119]](p5)
-    ; CHECK: G_STORE [[UV117]](s32), [[COPY116]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD120:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C57]](s32)
-    ; CHECK: [[COPY117:%[0-9]+]]:_(p5) = COPY [[PTR_ADD120]](p5)
-    ; CHECK: G_STORE [[UV118]](s32), [[COPY117]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD121:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C58]](s32)
-    ; CHECK: [[COPY118:%[0-9]+]]:_(p5) = COPY [[PTR_ADD121]](p5)
-    ; CHECK: G_STORE [[UV119]](s32), [[COPY118]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD122:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C59]](s32)
-    ; CHECK: [[COPY119:%[0-9]+]]:_(p5) = COPY [[PTR_ADD122]](p5)
-    ; CHECK: G_STORE [[UV120]](s32), [[COPY119]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD123:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C60]](s32)
-    ; CHECK: [[COPY120:%[0-9]+]]:_(p5) = COPY [[PTR_ADD123]](p5)
-    ; CHECK: G_STORE [[UV121]](s32), [[COPY120]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD124:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C61]](s32)
-    ; CHECK: [[COPY121:%[0-9]+]]:_(p5) = COPY [[PTR_ADD124]](p5)
-    ; CHECK: G_STORE [[UV122]](s32), [[COPY121]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD125:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C62]](s32)
-    ; CHECK: [[COPY122:%[0-9]+]]:_(p5) = COPY [[PTR_ADD125]](p5)
-    ; CHECK: G_STORE [[UV123]](s32), [[COPY122]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD126:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C63]](s32)
-    ; CHECK: [[COPY123:%[0-9]+]]:_(p5) = COPY [[PTR_ADD126]](p5)
-    ; CHECK: G_STORE [[UV124]](s32), [[COPY123]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD127:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C64]](s32)
-    ; CHECK: [[COPY124:%[0-9]+]]:_(p5) = COPY [[PTR_ADD127]](p5)
-    ; CHECK: G_STORE [[UV125]](s32), [[COPY124]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD128:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C65]](s32)
-    ; CHECK: [[COPY125:%[0-9]+]]:_(p5) = COPY [[PTR_ADD128]](p5)
-    ; CHECK: G_STORE [[UV126]](s32), [[COPY125]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
-    ; CHECK: [[PTR_ADD129:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C66]](s32)
-    ; CHECK: [[COPY126:%[0-9]+]]:_(p5) = COPY [[PTR_ADD129]](p5)
-    ; CHECK: G_STORE [[UV127]](s32), [[COPY126]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
-    ; CHECK: [[C68:%[0-9]+]]:_(s32) = G_CONSTANT i32 260
-    ; CHECK: [[PTR_ADD130:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C68]](s32)
-    ; CHECK: G_STORE [[C4]](s32), [[PTR_ADD130]](p5) :: (store 4 into %stack.0 + 260, addrspace 5)
-    ; CHECK: [[LOAD68:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p5) :: (load 4 from %stack.0 + 260, align 256, addrspace 5)
-    ; CHECK: [[LOAD69:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD67]](p5) :: (load 4 from %stack.0 + 264, align 256, addrspace 5)
-    ; CHECK: [[LOAD70:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD68]](p5) :: (load 4 from %stack.0 + 268, align 256, addrspace 5)
-    ; CHECK: [[LOAD71:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD69]](p5) :: (load 4 from %stack.0 + 272, align 256, addrspace 5)
-    ; CHECK: [[LOAD72:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD70]](p5) :: (load 4 from %stack.0 + 276, align 256, addrspace 5)
-    ; CHECK: [[LOAD73:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD71]](p5) :: (load 4 from %stack.0 + 280, align 256, addrspace 5)
-    ; CHECK: [[LOAD74:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD72]](p5) :: (load 4 from %stack.0 + 284, align 256, addrspace 5)
-    ; CHECK: [[LOAD75:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD73]](p5) :: (load 4 from %stack.0 + 288, align 256, addrspace 5)
-    ; CHECK: [[LOAD76:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD74]](p5) :: (load 4 from %stack.0 + 292, align 256, addrspace 5)
-    ; CHECK: [[LOAD77:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD75]](p5) :: (load 4 from %stack.0 + 296, align 256, addrspace 5)
-    ; CHECK: [[LOAD78:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD76]](p5) :: (load 4 from %stack.0 + 300, align 256, addrspace 5)
-    ; CHECK: [[LOAD79:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD77]](p5) :: (load 4 from %stack.0 + 304, align 256, addrspace 5)
-    ; CHECK: [[LOAD80:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD78]](p5) :: (load 4 from %stack.0 + 308, align 256, addrspace 5)
-    ; CHECK: [[LOAD81:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD79]](p5) :: (load 4 from %stack.0 + 312, align 256, addrspace 5)
-    ; CHECK: [[LOAD82:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD80]](p5) :: (load 4 from %stack.0 + 316, align 256, addrspace 5)
-    ; CHECK: [[LOAD83:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD81]](p5) :: (load 4 from %stack.0 + 320, align 256, addrspace 5)
-    ; CHECK: [[LOAD84:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD82]](p5) :: (load 4 from %stack.0 + 324, align 256, addrspace 5)
-    ; CHECK: [[LOAD85:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD83]](p5) :: (load 4 from %stack.0 + 328, align 256, addrspace 5)
-    ; CHECK: [[LOAD86:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD84]](p5) :: (load 4 from %stack.0 + 332, align 256, addrspace 5)
-    ; CHECK: [[LOAD87:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD85]](p5) :: (load 4 from %stack.0 + 336, align 256, addrspace 5)
-    ; CHECK: [[LOAD88:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD86]](p5) :: (load 4 from %stack.0 + 340, align 256, addrspace 5)
-    ; CHECK: [[LOAD89:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD87]](p5) :: (load 4 from %stack.0 + 344, align 256, addrspace 5)
-    ; CHECK: [[LOAD90:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD88]](p5) :: (load 4 from %stack.0 + 348, align 256, addrspace 5)
-    ; CHECK: [[LOAD91:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD89]](p5) :: (load 4 from %stack.0 + 352, align 256, addrspace 5)
-    ; CHECK: [[LOAD92:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD90]](p5) :: (load 4 from %stack.0 + 356, align 256, addrspace 5)
-    ; CHECK: [[LOAD93:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD91]](p5) :: (load 4 from %stack.0 + 360, align 256, addrspace 5)
-    ; CHECK: [[LOAD94:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD92]](p5) :: (load 4 from %stack.0 + 364, align 256, addrspace 5)
-    ; CHECK: [[LOAD95:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD93]](p5) :: (load 4 from %stack.0 + 368, align 256, addrspace 5)
-    ; CHECK: [[LOAD96:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD94]](p5) :: (load 4 from %stack.0 + 372, align 256, addrspace 5)
-    ; CHECK: [[LOAD97:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD95]](p5) :: (load 4 from %stack.0 + 376, align 256, addrspace 5)
-    ; CHECK: [[LOAD98:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD96]](p5) :: (load 4 from %stack.0 + 380, align 256, addrspace 5)
-    ; CHECK: [[LOAD99:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD97]](p5) :: (load 4 from %stack.0 + 384, align 256, addrspace 5)
-    ; CHECK: [[LOAD100:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD98]](p5) :: (load 4 from %stack.0 + 388, align 256, addrspace 5)
-    ; CHECK: [[LOAD101:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD99]](p5) :: (load 4 from %stack.0 + 392, align 256, addrspace 5)
-    ; CHECK: [[LOAD102:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD100]](p5) :: (load 4 from %stack.0 + 396, align 256, addrspace 5)
-    ; CHECK: [[LOAD103:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD101]](p5) :: (load 4 from %stack.0 + 400, align 256, addrspace 5)
-    ; CHECK: [[LOAD104:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD102]](p5) :: (load 4 from %stack.0 + 404, align 256, addrspace 5)
-    ; CHECK: [[LOAD105:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD103]](p5) :: (load 4 from %stack.0 + 408, align 256, addrspace 5)
-    ; CHECK: [[LOAD106:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD104]](p5) :: (load 4 from %stack.0 + 412, align 256, addrspace 5)
-    ; CHECK: [[LOAD107:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD105]](p5) :: (load 4 from %stack.0 + 416, align 256, addrspace 5)
-    ; CHECK: [[LOAD108:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD106]](p5) :: (load 4 from %stack.0 + 420, align 256, addrspace 5)
-    ; CHECK: [[LOAD109:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD107]](p5) :: (load 4 from %stack.0 + 424, align 256, addrspace 5)
-    ; CHECK: [[LOAD110:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD108]](p5) :: (load 4 from %stack.0 + 428, align 256, addrspace 5)
-    ; CHECK: [[LOAD111:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD109]](p5) :: (load 4 from %stack.0 + 432, align 256, addrspace 5)
-    ; CHECK: [[LOAD112:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD110]](p5) :: (load 4 from %stack.0 + 436, align 256, addrspace 5)
-    ; CHECK: [[LOAD113:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD111]](p5) :: (load 4 from %stack.0 + 440, align 256, addrspace 5)
-    ; CHECK: [[LOAD114:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD112]](p5) :: (load 4 from %stack.0 + 444, align 256, addrspace 5)
-    ; CHECK: [[LOAD115:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD113]](p5) :: (load 4 from %stack.0 + 448, align 256, addrspace 5)
-    ; CHECK: [[LOAD116:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD114]](p5) :: (load 4 from %stack.0 + 452, align 256, addrspace 5)
-    ; CHECK: [[LOAD117:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD115]](p5) :: (load 4 from %stack.0 + 456, align 256, addrspace 5)
-    ; CHECK: [[LOAD118:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD116]](p5) :: (load 4 from %stack.0 + 460, align 256, addrspace 5)
-    ; CHECK: [[LOAD119:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD117]](p5) :: (load 4 from %stack.0 + 464, align 256, addrspace 5)
-    ; CHECK: [[LOAD120:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD118]](p5) :: (load 4 from %stack.0 + 468, align 256, addrspace 5)
-    ; CHECK: [[LOAD121:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD119]](p5) :: (load 4 from %stack.0 + 472, align 256, addrspace 5)
-    ; CHECK: [[LOAD122:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD120]](p5) :: (load 4 from %stack.0 + 476, align 256, addrspace 5)
-    ; CHECK: [[LOAD123:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD121]](p5) :: (load 4 from %stack.0 + 480, align 256, addrspace 5)
-    ; CHECK: [[LOAD124:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD122]](p5) :: (load 4 from %stack.0 + 484, align 256, addrspace 5)
-    ; CHECK: [[LOAD125:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD123]](p5) :: (load 4 from %stack.0 + 488, align 256, addrspace 5)
-    ; CHECK: [[LOAD126:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD124]](p5) :: (load 4 from %stack.0 + 492, align 256, addrspace 5)
-    ; CHECK: [[LOAD127:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD125]](p5) :: (load 4 from %stack.0 + 496, align 256, addrspace 5)
-    ; CHECK: [[LOAD128:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD126]](p5) :: (load 4 from %stack.0 + 500, align 256, addrspace 5)
-    ; CHECK: [[LOAD129:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD127]](p5) :: (load 4 from %stack.0 + 504, align 256, addrspace 5)
-    ; CHECK: [[LOAD130:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD128]](p5) :: (load 4 from %stack.0 + 508, align 256, addrspace 5)
-    ; CHECK: [[LOAD131:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD129]](p5) :: (load 4 from %stack.0 + 512, align 256, addrspace 5)
-    ; CHECK: [[COPY127:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
-    ; CHECK: [[COPY128:%[0-9]+]]:_(p1) = COPY $vgpr2_vgpr3
-    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
-    ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32)
-    ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32)
-    ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD16]](s32), [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32)
-    ; CHECK: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32)
-    ; CHECK: [[BUILD_VECTOR5:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32)
-    ; CHECK: [[BUILD_VECTOR6:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32)
-    ; CHECK: [[BUILD_VECTOR7:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD32]](s32), [[LOAD33]](s32), [[LOAD34]](s32), [[LOAD35]](s32)
-    ; CHECK: [[BUILD_VECTOR8:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD36]](s32), [[LOAD37]](s32), [[LOAD38]](s32), [[LOAD39]](s32)
-    ; CHECK: [[BUILD_VECTOR9:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD40]](s32), [[LOAD41]](s32), [[LOAD42]](s32), [[LOAD43]](s32)
-    ; CHECK: [[BUILD_VECTOR10:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD44]](s32), [[LOAD45]](s32), [[LOAD46]](s32), [[LOAD47]](s32)
-    ; CHECK: [[BUILD_VECTOR11:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD48]](s32), [[LOAD49]](s32), [[LOAD50]](s32), [[LOAD51]](s32)
-    ; CHECK: [[BUILD_VECTOR12:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD52]](s32), [[LOAD53]](s32), [[LOAD54]](s32), [[LOAD55]](s32)
-    ; CHECK: [[BUILD_VECTOR13:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD56]](s32), [[LOAD57]](s32), [[LOAD58]](s32), [[LOAD59]](s32)
-    ; CHECK: [[BUILD_VECTOR14:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD60]](s32), [[LOAD61]](s32), [[LOAD62]](s32), [[LOAD63]](s32)
-    ; CHECK: [[BUILD_VECTOR15:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD64]](s32), [[LOAD65]](s32), [[LOAD66]](s32), [[LOAD67]](s32)
-    ; CHECK: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY127]](p1) :: (store 16, align 4, addrspace 1)
-    ; CHECK: [[C69:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; CHECK: [[PTR_ADD131:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C69]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[PTR_ADD131]](p1) :: (store 16 + 16, align 4, addrspace 1)
-    ; CHECK: [[C70:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; CHECK: [[PTR_ADD132:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C70]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[PTR_ADD132]](p1) :: (store 16 + 32, align 4, addrspace 1)
-    ; CHECK: [[C71:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
-    ; CHECK: [[PTR_ADD133:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C71]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR3]](<4 x s32>), [[PTR_ADD133]](p1) :: (store 16 + 48, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD134:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C1]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR4]](<4 x s32>), [[PTR_ADD134]](p1) :: (store 16 + 64, align 4, addrspace 1)
-    ; CHECK: [[C72:%[0-9]+]]:_(s64) = G_CONSTANT i64 80
-    ; CHECK: [[PTR_ADD135:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C72]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR5]](<4 x s32>), [[PTR_ADD135]](p1) :: (store 16 + 80, align 4, addrspace 1)
-    ; CHECK: [[C73:%[0-9]+]]:_(s64) = G_CONSTANT i64 96
-    ; CHECK: [[PTR_ADD136:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C73]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR6]](<4 x s32>), [[PTR_ADD136]](p1) :: (store 16 + 96, align 4, addrspace 1)
-    ; CHECK: [[C74:%[0-9]+]]:_(s64) = G_CONSTANT i64 112
-    ; CHECK: [[PTR_ADD137:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C74]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR7]](<4 x s32>), [[PTR_ADD137]](p1) :: (store 16 + 112, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD138:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C2]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR8]](<4 x s32>), [[PTR_ADD138]](p1) :: (store 16 + 128, align 4, addrspace 1)
-    ; CHECK: [[C75:%[0-9]+]]:_(s64) = G_CONSTANT i64 144
-    ; CHECK: [[PTR_ADD139:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C75]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR9]](<4 x s32>), [[PTR_ADD139]](p1) :: (store 16 + 144, align 4, addrspace 1)
-    ; CHECK: [[C76:%[0-9]+]]:_(s64) = G_CONSTANT i64 160
-    ; CHECK: [[PTR_ADD140:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C76]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR10]](<4 x s32>), [[PTR_ADD140]](p1) :: (store 16 + 160, align 4, addrspace 1)
-    ; CHECK: [[C77:%[0-9]+]]:_(s64) = G_CONSTANT i64 176
-    ; CHECK: [[PTR_ADD141:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C77]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR11]](<4 x s32>), [[PTR_ADD141]](p1) :: (store 16 + 176, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD142:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C3]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR12]](<4 x s32>), [[PTR_ADD142]](p1) :: (store 16 + 192, align 4, addrspace 1)
-    ; CHECK: [[C78:%[0-9]+]]:_(s64) = G_CONSTANT i64 208
-    ; CHECK: [[PTR_ADD143:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C78]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR13]](<4 x s32>), [[PTR_ADD143]](p1) :: (store 16 + 208, align 4, addrspace 1)
-    ; CHECK: [[C79:%[0-9]+]]:_(s64) = G_CONSTANT i64 224
-    ; CHECK: [[PTR_ADD144:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C79]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR14]](<4 x s32>), [[PTR_ADD144]](p1) :: (store 16 + 224, align 4, addrspace 1)
-    ; CHECK: [[C80:%[0-9]+]]:_(s64) = G_CONSTANT i64 240
-    ; CHECK: [[PTR_ADD145:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY127]], [[C80]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR15]](<4 x s32>), [[PTR_ADD145]](p1) :: (store 16 + 240, align 4, addrspace 1)
-    ; CHECK: [[BUILD_VECTOR16:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD68]](s32), [[LOAD69]](s32), [[LOAD70]](s32), [[LOAD71]](s32)
-    ; CHECK: [[BUILD_VECTOR17:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD72]](s32), [[LOAD73]](s32), [[LOAD74]](s32), [[LOAD75]](s32)
-    ; CHECK: [[BUILD_VECTOR18:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD76]](s32), [[LOAD77]](s32), [[LOAD78]](s32), [[LOAD79]](s32)
-    ; CHECK: [[BUILD_VECTOR19:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD80]](s32), [[LOAD81]](s32), [[LOAD82]](s32), [[LOAD83]](s32)
-    ; CHECK: [[BUILD_VECTOR20:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD84]](s32), [[LOAD85]](s32), [[LOAD86]](s32), [[LOAD87]](s32)
-    ; CHECK: [[BUILD_VECTOR21:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD88]](s32), [[LOAD89]](s32), [[LOAD90]](s32), [[LOAD91]](s32)
-    ; CHECK: [[BUILD_VECTOR22:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD92]](s32), [[LOAD93]](s32), [[LOAD94]](s32), [[LOAD95]](s32)
-    ; CHECK: [[BUILD_VECTOR23:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD96]](s32), [[LOAD97]](s32), [[LOAD98]](s32), [[LOAD99]](s32)
-    ; CHECK: [[BUILD_VECTOR24:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD100]](s32), [[LOAD101]](s32), [[LOAD102]](s32), [[LOAD103]](s32)
-    ; CHECK: [[BUILD_VECTOR25:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD104]](s32), [[LOAD105]](s32), [[LOAD106]](s32), [[LOAD107]](s32)
-    ; CHECK: [[BUILD_VECTOR26:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD108]](s32), [[LOAD109]](s32), [[LOAD110]](s32), [[LOAD111]](s32)
-    ; CHECK: [[BUILD_VECTOR27:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD112]](s32), [[LOAD113]](s32), [[LOAD114]](s32), [[LOAD115]](s32)
-    ; CHECK: [[BUILD_VECTOR28:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD116]](s32), [[LOAD117]](s32), [[LOAD118]](s32), [[LOAD119]](s32)
-    ; CHECK: [[BUILD_VECTOR29:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD120]](s32), [[LOAD121]](s32), [[LOAD122]](s32), [[LOAD123]](s32)
-    ; CHECK: [[BUILD_VECTOR30:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD124]](s32), [[LOAD125]](s32), [[LOAD126]](s32), [[LOAD127]](s32)
-    ; CHECK: [[BUILD_VECTOR31:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD128]](s32), [[LOAD129]](s32), [[LOAD130]](s32), [[LOAD131]](s32)
-    ; CHECK: G_STORE [[BUILD_VECTOR16]](<4 x s32>), [[COPY128]](p1) :: (store 16, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD146:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C69]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR17]](<4 x s32>), [[PTR_ADD146]](p1) :: (store 16 + 16, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD147:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C70]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR18]](<4 x s32>), [[PTR_ADD147]](p1) :: (store 16 + 32, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD148:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C71]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR19]](<4 x s32>), [[PTR_ADD148]](p1) :: (store 16 + 48, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD149:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C1]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR20]](<4 x s32>), [[PTR_ADD149]](p1) :: (store 16 + 64, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD150:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C72]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR21]](<4 x s32>), [[PTR_ADD150]](p1) :: (store 16 + 80, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD151:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C73]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR22]](<4 x s32>), [[PTR_ADD151]](p1) :: (store 16 + 96, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD152:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C74]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR23]](<4 x s32>), [[PTR_ADD152]](p1) :: (store 16 + 112, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD153:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C2]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR24]](<4 x s32>), [[PTR_ADD153]](p1) :: (store 16 + 128, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD154:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C75]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR25]](<4 x s32>), [[PTR_ADD154]](p1) :: (store 16 + 144, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD155:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C76]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR26]](<4 x s32>), [[PTR_ADD155]](p1) :: (store 16 + 160, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD156:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C77]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR27]](<4 x s32>), [[PTR_ADD156]](p1) :: (store 16 + 176, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD157:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C3]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR28]](<4 x s32>), [[PTR_ADD157]](p1) :: (store 16 + 192, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD158:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C78]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR29]](<4 x s32>), [[PTR_ADD158]](p1) :: (store 16 + 208, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD159:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C79]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR30]](<4 x s32>), [[PTR_ADD159]](p1) :: (store 16 + 224, align 4, addrspace 1)
-    ; CHECK: [[PTR_ADD160:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY128]], [[C80]](s64)
-    ; CHECK: G_STORE [[BUILD_VECTOR31]](<4 x s32>), [[PTR_ADD160]](p1) :: (store 16 + 240, align 4, addrspace 1)
+    ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF
+    ; CHECK: [[COPY1:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:_(p1) = COPY $vgpr2_vgpr3
+    ; CHECK: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>), [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV4:%[0-9]+]]:_(<4 x s32>), [[UV5:%[0-9]+]]:_(<4 x s32>), [[UV6:%[0-9]+]]:_(<4 x s32>), [[UV7:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV8:%[0-9]+]]:_(<4 x s32>), [[UV9:%[0-9]+]]:_(<4 x s32>), [[UV10:%[0-9]+]]:_(<4 x s32>), [[UV11:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV12:%[0-9]+]]:_(<4 x s32>), [[UV13:%[0-9]+]]:_(<4 x s32>), [[UV14:%[0-9]+]]:_(<4 x s32>), [[UV15:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: G_STORE [[UV]](<4 x s32>), [[COPY1]](p1) :: (store 16, align 4, addrspace 1)
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C]](s64)
+    ; CHECK: G_STORE [[UV1]](<4 x s32>), [[PTR_ADD]](p1) :: (store 16 + 16, align 4, addrspace 1)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64)
+    ; CHECK: G_STORE [[UV2]](<4 x s32>), [[PTR_ADD1]](p1) :: (store 16 + 32, align 4, addrspace 1)
+    ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+    ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64)
+    ; CHECK: G_STORE [[UV3]](<4 x s32>), [[PTR_ADD2]](p1) :: (store 16 + 48, align 4, addrspace 1)
+    ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
+    ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C3]](s64)
+    ; CHECK: G_STORE [[UV4]](<4 x s32>), [[PTR_ADD3]](p1) :: (store 16 + 64, align 4, addrspace 1)
+    ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 80
+    ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C4]](s64)
+    ; CHECK: G_STORE [[UV5]](<4 x s32>), [[PTR_ADD4]](p1) :: (store 16 + 80, align 4, addrspace 1)
+    ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 96
+    ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C5]](s64)
+    ; CHECK: G_STORE [[UV6]](<4 x s32>), [[PTR_ADD5]](p1) :: (store 16 + 96, align 4, addrspace 1)
+    ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 112
+    ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C6]](s64)
+    ; CHECK: G_STORE [[UV7]](<4 x s32>), [[PTR_ADD6]](p1) :: (store 16 + 112, align 4, addrspace 1)
+    ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 128
+    ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C7]](s64)
+    ; CHECK: G_STORE [[UV8]](<4 x s32>), [[PTR_ADD7]](p1) :: (store 16 + 128, align 4, addrspace 1)
+    ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 144
+    ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C8]](s64)
+    ; CHECK: G_STORE [[UV9]](<4 x s32>), [[PTR_ADD8]](p1) :: (store 16 + 144, align 4, addrspace 1)
+    ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 160
+    ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C9]](s64)
+    ; CHECK: G_STORE [[UV10]](<4 x s32>), [[PTR_ADD9]](p1) :: (store 16 + 160, align 4, addrspace 1)
+    ; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 176
+    ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C10]](s64)
+    ; CHECK: G_STORE [[UV11]](<4 x s32>), [[PTR_ADD10]](p1) :: (store 16 + 176, align 4, addrspace 1)
+    ; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 192
+    ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C11]](s64)
+    ; CHECK: G_STORE [[UV12]](<4 x s32>), [[PTR_ADD11]](p1) :: (store 16 + 192, align 4, addrspace 1)
+    ; CHECK: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 208
+    ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C12]](s64)
+    ; CHECK: G_STORE [[UV13]](<4 x s32>), [[PTR_ADD12]](p1) :: (store 16 + 208, align 4, addrspace 1)
+    ; CHECK: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 224
+    ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C13]](s64)
+    ; CHECK: G_STORE [[UV14]](<4 x s32>), [[PTR_ADD13]](p1) :: (store 16 + 224, align 4, addrspace 1)
+    ; CHECK: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 240
+    ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C14]](s64)
+    ; CHECK: G_STORE [[UV15]](<4 x s32>), [[PTR_ADD14]](p1) :: (store 16 + 240, align 4, addrspace 1)
+    ; CHECK: [[UV16:%[0-9]+]]:_(<4 x s32>), [[UV17:%[0-9]+]]:_(<4 x s32>), [[UV18:%[0-9]+]]:_(<4 x s32>), [[UV19:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV20:%[0-9]+]]:_(<4 x s32>), [[UV21:%[0-9]+]]:_(<4 x s32>), [[UV22:%[0-9]+]]:_(<4 x s32>), [[UV23:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV24:%[0-9]+]]:_(<4 x s32>), [[UV25:%[0-9]+]]:_(<4 x s32>), [[UV26:%[0-9]+]]:_(<4 x s32>), [[UV27:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: [[UV28:%[0-9]+]]:_(<4 x s32>), [[UV29:%[0-9]+]]:_(<4 x s32>), [[UV30:%[0-9]+]]:_(<4 x s32>), [[UV31:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[DEF]](<16 x s32>)
+    ; CHECK: G_STORE [[UV16]](<4 x s32>), [[COPY2]](p1) :: (store 16, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C]](s64)
+    ; CHECK: G_STORE [[UV17]](<4 x s32>), [[PTR_ADD15]](p1) :: (store 16 + 16, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C1]](s64)
+    ; CHECK: G_STORE [[UV18]](<4 x s32>), [[PTR_ADD16]](p1) :: (store 16 + 32, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C2]](s64)
+    ; CHECK: G_STORE [[UV19]](<4 x s32>), [[PTR_ADD17]](p1) :: (store 16 + 48, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C3]](s64)
+    ; CHECK: G_STORE [[UV20]](<4 x s32>), [[PTR_ADD18]](p1) :: (store 16 + 64, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C4]](s64)
+    ; CHECK: G_STORE [[UV21]](<4 x s32>), [[PTR_ADD19]](p1) :: (store 16 + 80, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C5]](s64)
+    ; CHECK: G_STORE [[UV22]](<4 x s32>), [[PTR_ADD20]](p1) :: (store 16 + 96, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C6]](s64)
+    ; CHECK: G_STORE [[UV23]](<4 x s32>), [[PTR_ADD21]](p1) :: (store 16 + 112, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C7]](s64)
+    ; CHECK: G_STORE [[UV24]](<4 x s32>), [[PTR_ADD22]](p1) :: (store 16 + 128, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C8]](s64)
+    ; CHECK: G_STORE [[UV25]](<4 x s32>), [[PTR_ADD23]](p1) :: (store 16 + 144, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C9]](s64)
+    ; CHECK: G_STORE [[UV26]](<4 x s32>), [[PTR_ADD24]](p1) :: (store 16 + 160, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C10]](s64)
+    ; CHECK: G_STORE [[UV27]](<4 x s32>), [[PTR_ADD25]](p1) :: (store 16 + 176, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C11]](s64)
+    ; CHECK: G_STORE [[UV28]](<4 x s32>), [[PTR_ADD26]](p1) :: (store 16 + 192, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C12]](s64)
+    ; CHECK: G_STORE [[UV29]](<4 x s32>), [[PTR_ADD27]](p1) :: (store 16 + 208, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C13]](s64)
+    ; CHECK: G_STORE [[UV30]](<4 x s32>), [[PTR_ADD28]](p1) :: (store 16 + 224, align 4, addrspace 1)
+    ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY2]], [[C14]](s64)
+    ; CHECK: G_STORE [[UV31]](<4 x s32>), [[PTR_ADD29]](p1) :: (store 16 + 240, align 4, addrspace 1)
     %0:_(p1) = COPY $sgpr0_sgpr1
     %1:_(s32) = G_CONSTANT i32 64
     %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4)
@@ -929,390 +311,55 @@ body: |
     ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY]], [[C2]](s64)
    ; CHECK: [[LOAD3:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 64 + 192, align 4, addrspace 4)
    ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 12345
-    ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0
-    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>)
-    ; CHECK: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>)
-    ; CHECK: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32), [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32), [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32), [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32), [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32), [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32), [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32), [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD2]](<16 x s32>)
-    ; CHECK: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32), [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32), [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32), [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32), [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32), [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32), [[UV60:%[0-9]+]]:_(s32), [[UV61:%[0-9]+]]:_(s32), [[UV62:%[0-9]+]]:_(s32), [[UV63:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD3]](<16 x s32>)
-    ; CHECK: G_STORE [[UV]](s32), [[FRAME_INDEX]](p5) :: (store 4 into %stack.0, align 256, addrspace 5)
-    ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C4]](s32)
-    ; CHECK: [[COPY1:%[0-9]+]]:_(p5) = COPY [[PTR_ADD3]](p5)
-    ; CHECK: G_STORE [[UV1]](s32), [[COPY1]](p5) :: (store 4 into %stack.0 + 4, align 256, addrspace 5)
-    ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C5]](s32)
-    ; CHECK: [[COPY2:%[0-9]+]]:_(p5) = COPY [[PTR_ADD4]](p5)
-    ; CHECK: G_STORE [[UV2]](s32), [[COPY2]](p5) :: (store 4 into %stack.0 + 8, align 256, addrspace 5)
-    ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C6]](s32)
-    ; CHECK: [[COPY3:%[0-9]+]]:_(p5) = COPY [[PTR_ADD5]](p5)
-    ; CHECK: G_STORE [[UV3]](s32), [[COPY3]](p5) :: (store 4 into %stack.0 + 12, align 256, addrspace 5)
-    ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C7]](s32)
-    ; CHECK: [[COPY4:%[0-9]+]]:_(p5) = COPY [[PTR_ADD6]](p5)
-    ; CHECK: G_STORE [[UV4]](s32), [[COPY4]](p5) :: (store 4 into %stack.0 + 16, align 256, addrspace 5)
-    ; CHECK: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
-    ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C8]](s32)
-    ; CHECK: [[COPY5:%[0-9]+]]:_(p5) = COPY [[PTR_ADD7]](p5)
-    ; CHECK: G_STORE [[UV5]](s32), [[COPY5]](p5) :: (store 4 into %stack.0 + 20, align 256, addrspace 5)
-    ; CHECK: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C9]](s32)
-    ; CHECK: [[COPY6:%[0-9]+]]:_(p5) = COPY [[PTR_ADD8]](p5)
-    ; CHECK: G_STORE [[UV6]](s32), [[COPY6]](p5) :: (store 4 into %stack.0 + 24, align 256, addrspace 5)
-    ; CHECK: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 28
-    ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C10]](s32)
-    ; CHECK: [[COPY7:%[0-9]+]]:_(p5) = COPY [[PTR_ADD9]](p5)
-    ; CHECK: G_STORE [[UV7]](s32), [[COPY7]](p5) :: (store 4 into %stack.0 + 28, align 256, addrspace 5)
-    ; CHECK: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C11]](s32)
-    ; CHECK: [[COPY8:%[0-9]+]]:_(p5) = COPY [[PTR_ADD10]](p5)
-    ; CHECK: G_STORE [[UV8]](s32), [[COPY8]](p5) :: (store 4 into %stack.0 + 32, align 256, addrspace 5)
-    ; CHECK: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 36
-    ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C12]](s32)
-    ; CHECK: [[COPY9:%[0-9]+]]:_(p5) = COPY [[PTR_ADD11]](p5)
-    ; CHECK: G_STORE [[UV9]](s32), [[COPY9]](p5) :: (store 4 into %stack.0 + 36, align 256, addrspace 5)
-    ; CHECK: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 40
-    ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C13]](s32)
-    ; CHECK: [[COPY10:%[0-9]+]]:_(p5) = COPY [[PTR_ADD12]](p5)
-    ; CHECK: G_STORE [[UV10]](s32), [[COPY10]](p5) :: (store 4 into %stack.0 + 40, align 256, addrspace 5)
-    ; CHECK: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 44
-    ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C14]](s32)
-    ; CHECK: [[COPY11:%[0-9]+]]:_(p5) = COPY [[PTR_ADD13]](p5)
-    ; CHECK: G_STORE [[UV11]](s32), [[COPY11]](p5) :: (store 4 into %stack.0 + 44, align 256, addrspace 5)
-    ; CHECK: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 48
-    ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C15]](s32)
-    ; CHECK: [[COPY12:%[0-9]+]]:_(p5) = COPY [[PTR_ADD14]](p5)
-    ; CHECK: G_STORE [[UV12]](s32), [[COPY12]](p5) :: (store 4 into %stack.0 + 48, align 256, addrspace 5)
-    ; CHECK: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 52
-    ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C16]](s32)
-    ; CHECK: [[COPY13:%[0-9]+]]:_(p5) = COPY [[PTR_ADD15]](p5)
-    ; CHECK: G_STORE [[UV13]](s32), [[COPY13]](p5) :: (store 4 into %stack.0 + 52, align 256, addrspace 5)
-    ; CHECK: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 56
-    ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C17]](s32)
-    ; CHECK: [[COPY14:%[0-9]+]]:_(p5) = COPY [[PTR_ADD16]](p5)
-    ; CHECK: G_STORE [[UV14]](s32), [[COPY14]](p5) :: (store 4 into %stack.0 + 56, align 256, addrspace 5)
-    ; CHECK: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 60
-    ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C18]](s32)
-    ; CHECK: [[COPY15:%[0-9]+]]:_(p5) = COPY [[PTR_ADD17]](p5)
-    ; CHECK: G_STORE [[UV15]](s32), [[COPY15]](p5) :: (store 4 into %stack.0 + 60, align 256, addrspace 5)
-    ; CHECK: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
-    ; CHECK: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C19]](s32)
-    ; CHECK: [[COPY16:%[0-9]+]]:_(p5) = COPY [[PTR_ADD18]](p5)
-    ; CHECK: G_STORE [[UV16]](s32), [[COPY16]](p5) :: (store 4 into %stack.0 + 64, align 256, addrspace 5)
-    ; CHECK: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 68
-    ; CHECK: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C20]](s32)
-    ; CHECK: [[COPY17:%[0-9]+]]:_(p5) = COPY [[PTR_ADD19]](p5)
-    ; CHECK: G_STORE [[UV17]](s32), [[COPY17]](p5) :: (store 4 into %stack.0 + 68, align 256, addrspace 5)
-    ; CHECK: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 72
-    ; CHECK: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C21]](s32)
-    ; CHECK: [[COPY18:%[0-9]+]]:_(p5) = COPY [[PTR_ADD20]](p5)
-    ; CHECK: G_STORE [[UV18]](s32), [[COPY18]](p5) :: (store 4 into %stack.0 + 72, align 256, addrspace 5)
-    ; CHECK: [[C22:%[0-9]+]]:_(s32) = G_CONSTANT i32 76
-    ; CHECK: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C22]](s32)
-    ; CHECK: [[COPY19:%[0-9]+]]:_(p5) = COPY [[PTR_ADD21]](p5)
-    ; CHECK: G_STORE [[UV19]](s32), [[COPY19]](p5) :: (store 4 into %stack.0 + 76, align 256, addrspace 5)
-    ; CHECK: [[C23:%[0-9]+]]:_(s32) = G_CONSTANT i32 80
-    ; CHECK: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C23]](s32)
-    ; CHECK: [[COPY20:%[0-9]+]]:_(p5) = COPY [[PTR_ADD22]](p5)
-    ; CHECK: G_STORE [[UV20]](s32), [[COPY20]](p5) :: (store 4 into %stack.0 + 80, align 256, addrspace 5)
-    ; CHECK: [[C24:%[0-9]+]]:_(s32) = G_CONSTANT i32 84
-    ; CHECK: [[PTR_ADD23:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C24]](s32)
-    ; CHECK: [[COPY21:%[0-9]+]]:_(p5) = COPY [[PTR_ADD23]](p5)
-    ; CHECK: G_STORE [[UV21]](s32), [[COPY21]](p5) :: (store 4 into %stack.0 + 84, align 256, addrspace 5)
-    ; CHECK: [[C25:%[0-9]+]]:_(s32) = G_CONSTANT i32 88
-    ; CHECK: [[PTR_ADD24:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C25]](s32)
-    ; CHECK: [[COPY22:%[0-9]+]]:_(p5) = COPY [[PTR_ADD24]](p5)
-    ; CHECK: G_STORE [[UV22]](s32), [[COPY22]](p5) :: (store 4 into %stack.0 + 88, align 256, addrspace 5)
-    ; CHECK: [[C26:%[0-9]+]]:_(s32) = G_CONSTANT i32 92
-    ; CHECK: [[PTR_ADD25:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C26]](s32)
-    ; CHECK: [[COPY23:%[0-9]+]]:_(p5) = COPY [[PTR_ADD25]](p5)
-    ; CHECK: G_STORE [[UV23]](s32), [[COPY23]](p5) :: (store 4 into %stack.0 + 92, align 256, addrspace 5)
-    ; CHECK: [[C27:%[0-9]+]]:_(s32) = G_CONSTANT i32 96
-    ; CHECK: [[PTR_ADD26:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C27]](s32)
-    ; CHECK: [[COPY24:%[0-9]+]]:_(p5) = COPY [[PTR_ADD26]](p5)
-    ; CHECK: G_STORE [[UV24]](s32), [[COPY24]](p5) :: (store 4 into %stack.0 + 96, align 256, addrspace 5)
-    ; CHECK: [[C28:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-    ; CHECK: [[PTR_ADD27:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C28]](s32)
-    ; CHECK: [[COPY25:%[0-9]+]]:_(p5) = COPY [[PTR_ADD27]](p5)
-    ; CHECK: G_STORE [[UV25]](s32), [[COPY25]](p5) :: (store 4 into %stack.0 + 100, align 256, addrspace 5)
-    ; CHECK: [[C29:%[0-9]+]]:_(s32) = G_CONSTANT i32 104
-    ; CHECK: [[PTR_ADD28:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C29]](s32)
-    ; CHECK: [[COPY26:%[0-9]+]]:_(p5) = COPY [[PTR_ADD28]](p5)
-    ; CHECK: G_STORE [[UV26]](s32), [[COPY26]](p5) :: (store 4 into %stack.0 + 104, align 256, addrspace 5)
-    ; CHECK: [[C30:%[0-9]+]]:_(s32) = G_CONSTANT i32 108
-    ; CHECK: [[PTR_ADD29:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C30]](s32)
-    ; CHECK: [[COPY27:%[0-9]+]]:_(p5) = COPY [[PTR_ADD29]](p5)
-    ; CHECK: G_STORE [[UV27]](s32), [[COPY27]](p5) :: (store 4 into %stack.0 + 108, align 256, addrspace 5)
-    ; CHECK: [[C31:%[0-9]+]]:_(s32) = G_CONSTANT i32 112
-    ; CHECK: [[PTR_ADD30:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C31]](s32)
-    ; CHECK: [[COPY28:%[0-9]+]]:_(p5) = COPY [[PTR_ADD30]](p5)
-    ; CHECK: G_STORE [[UV28]](s32), [[COPY28]](p5) :: (store 4 into %stack.0 + 112, align 256, addrspace 5)
-    ; CHECK: [[C32:%[0-9]+]]:_(s32) = G_CONSTANT i32 116
-    ; CHECK: [[PTR_ADD31:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C32]](s32)
-    ; CHECK: [[COPY29:%[0-9]+]]:_(p5) = COPY [[PTR_ADD31]](p5)
-    ; CHECK: G_STORE [[UV29]](s32), [[COPY29]](p5) :: (store 4 into %stack.0 + 116, align 256, addrspace 5)
-    ; CHECK: [[C33:%[0-9]+]]:_(s32) = G_CONSTANT i32 120
-    ; CHECK: [[PTR_ADD32:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C33]](s32)
-    ; CHECK: [[COPY30:%[0-9]+]]:_(p5) = COPY [[PTR_ADD32]](p5)
-    ; CHECK: G_STORE [[UV30]](s32), [[COPY30]](p5) :: (store 4 into %stack.0 + 120, align 256, addrspace 5)
-    ; CHECK: [[C34:%[0-9]+]]:_(s32) = G_CONSTANT i32 124
-    ; CHECK: [[PTR_ADD33:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C34]](s32)
-    ; CHECK: [[COPY31:%[0-9]+]]:_(p5) = COPY [[PTR_ADD33]](p5)
-    ; CHECK: G_STORE [[UV31]](s32), [[COPY31]](p5) :: (store 4 into %stack.0 + 124, align 256, addrspace 5)
-    ; CHECK: [[C35:%[0-9]+]]:_(s32) = G_CONSTANT i32 128
-    ; CHECK: [[PTR_ADD34:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C35]](s32)
-    ; CHECK: [[COPY32:%[0-9]+]]:_(p5) = COPY [[PTR_ADD34]](p5)
-    ; CHECK: G_STORE [[UV32]](s32), [[COPY32]](p5) :: (store 4 into %stack.0 + 128, align 256, addrspace 5)
-    ; CHECK: [[C36:%[0-9]+]]:_(s32) = G_CONSTANT i32 132
-    ; CHECK: [[PTR_ADD35:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C36]](s32)
-    ; CHECK: [[COPY33:%[0-9]+]]:_(p5) = COPY [[PTR_ADD35]](p5)
-    ; CHECK: G_STORE [[UV33]](s32), [[COPY33]](p5) :: (store 4 into %stack.0 + 132, align 256, addrspace 5)
-    ; CHECK: [[C37:%[0-9]+]]:_(s32) = G_CONSTANT i32 136
-    ; CHECK: [[PTR_ADD36:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C37]](s32)
-    ; CHECK: [[COPY34:%[0-9]+]]:_(p5) = COPY [[PTR_ADD36]](p5)
-    ; CHECK: G_STORE [[UV34]](s32), [[COPY34]](p5) :: (store 4 into %stack.0 + 136, align 256, addrspace 5)
-    ; CHECK: [[C38:%[0-9]+]]:_(s32) = G_CONSTANT i32 140
-    ; CHECK: [[PTR_ADD37:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C38]](s32)
-    ; CHECK: [[COPY35:%[0-9]+]]:_(p5) = COPY [[PTR_ADD37]](p5)
-    ; CHECK: G_STORE [[UV35]](s32), [[COPY35]](p5) :: (store 4 into %stack.0 + 140, align 256, addrspace 5)
-    ; CHECK: [[C39:%[0-9]+]]:_(s32) = G_CONSTANT i32 144
-    ; CHECK: [[PTR_ADD38:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C39]](s32)
-    ; CHECK: [[COPY36:%[0-9]+]]:_(p5) = COPY [[PTR_ADD38]](p5)
-    ; CHECK: G_STORE [[UV36]](s32), [[COPY36]](p5) :: (store 4 into %stack.0 + 144, align 256, addrspace 5)
-    ; CHECK: [[C40:%[0-9]+]]:_(s32) = G_CONSTANT i32 148
-    ; CHECK: [[PTR_ADD39:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C40]](s32)
-    ; CHECK: [[COPY37:%[0-9]+]]:_(p5) = COPY [[PTR_ADD39]](p5)
-    ; CHECK: G_STORE [[UV37]](s32), [[COPY37]](p5) :: (store 4 into %stack.0 + 148, align 256, addrspace 5)
-    ; CHECK: [[C41:%[0-9]+]]:_(s32) = G_CONSTANT i32 152
-    ; CHECK: [[PTR_ADD40:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C41]](s32)
-    ; CHECK: [[COPY38:%[0-9]+]]:_(p5) = COPY [[PTR_ADD40]](p5)
-    ; CHECK: G_STORE [[UV38]](s32), [[COPY38]](p5) :: (store 4 into %stack.0 + 152, align 256, addrspace 5)
-    ; CHECK: [[C42:%[0-9]+]]:_(s32) = G_CONSTANT i32 156
-    ; CHECK: [[PTR_ADD41:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C42]](s32)
-    ; CHECK: [[COPY39:%[0-9]+]]:_(p5) = COPY [[PTR_ADD41]](p5)
-    ; CHECK: G_STORE [[UV39]](s32), [[COPY39]](p5) :: (store 4 into %stack.0 + 156, align 256, addrspace 5)
-    ; CHECK: [[C43:%[0-9]+]]:_(s32) = G_CONSTANT i32 160
-    ; CHECK: [[PTR_ADD42:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C43]](s32)
-    ; CHECK: [[COPY40:%[0-9]+]]:_(p5) = COPY [[PTR_ADD42]](p5)
-    ; CHECK: G_STORE [[UV40]](s32), [[COPY40]](p5) :: (store 4 into %stack.0 + 160, align 256, addrspace 5)
-    ; CHECK: [[C44:%[0-9]+]]:_(s32) = G_CONSTANT i32 164
-    ; CHECK: [[PTR_ADD43:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C44]](s32)
-    ; CHECK: [[COPY41:%[0-9]+]]:_(p5) = COPY [[PTR_ADD43]](p5)
-    ; CHECK: G_STORE [[UV41]](s32), [[COPY41]](p5) :: (store 4 into %stack.0 + 164, align 256, addrspace 5)
-    ; CHECK: [[C45:%[0-9]+]]:_(s32) = G_CONSTANT i32 168
-    ; CHECK: [[PTR_ADD44:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C45]](s32)
-    ; CHECK: [[COPY42:%[0-9]+]]:_(p5) = COPY [[PTR_ADD44]](p5)
-    ; CHECK: G_STORE [[UV42]](s32), [[COPY42]](p5) :: (store 4 into %stack.0 + 168, align 256, addrspace 5)
-    ; CHECK: [[C46:%[0-9]+]]:_(s32) = G_CONSTANT i32 172
-    ; CHECK: [[PTR_ADD45:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C46]](s32)
-    ; CHECK: [[COPY43:%[0-9]+]]:_(p5) = COPY [[PTR_ADD45]](p5)
-    ; CHECK: G_STORE [[UV43]](s32), [[COPY43]](p5) :: (store 4 into %stack.0 + 172, align 256, addrspace 5)
-    ; CHECK: [[C47:%[0-9]+]]:_(s32) = G_CONSTANT i32 176
-    ; CHECK: [[PTR_ADD46:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C47]](s32)
-    ; CHECK: [[COPY44:%[0-9]+]]:_(p5) = COPY [[PTR_ADD46]](p5)
-    ; CHECK: G_STORE [[UV44]](s32), [[COPY44]](p5) :: (store 4 into %stack.0 + 176, align 256, addrspace 5)
-    ; CHECK: [[C48:%[0-9]+]]:_(s32) = G_CONSTANT i32 180
-    ; CHECK: [[PTR_ADD47:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C48]](s32)
-    ; CHECK: [[COPY45:%[0-9]+]]:_(p5) = COPY [[PTR_ADD47]](p5)
-    ; CHECK: G_STORE [[UV45]](s32), [[COPY45]](p5) :: (store 4 into %stack.0 + 180, align 256, addrspace 5)
-    ; CHECK: [[C49:%[0-9]+]]:_(s32) = G_CONSTANT i32 184
-    ; CHECK: [[PTR_ADD48:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C49]](s32)
-    ; CHECK: [[COPY46:%[0-9]+]]:_(p5) = COPY [[PTR_ADD48]](p5)
-    ; CHECK: G_STORE [[UV46]](s32), [[COPY46]](p5) :: (store 4 into %stack.0 + 184, align 256, addrspace 5)
-    ; CHECK: [[C50:%[0-9]+]]:_(s32) = G_CONSTANT i32 188
-    ; CHECK: [[PTR_ADD49:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C50]](s32)
-    ; CHECK: [[COPY47:%[0-9]+]]:_(p5) = COPY [[PTR_ADD49]](p5)
-    ; CHECK: G_STORE [[UV47]](s32), [[COPY47]](p5) :: (store 4 into %stack.0 + 188, align 256, addrspace 5)
-    ; CHECK: [[C51:%[0-9]+]]:_(s32) = G_CONSTANT i32 192
-    ; CHECK: [[PTR_ADD50:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C51]](s32)
-    ; CHECK: [[COPY48:%[0-9]+]]:_(p5) = COPY [[PTR_ADD50]](p5)
-    ; CHECK: G_STORE [[UV48]](s32), [[COPY48]](p5) :: (store 4 into %stack.0 + 192, align 256, addrspace 5)
-    ; CHECK: [[C52:%[0-9]+]]:_(s32) = G_CONSTANT i32 196
-    ; CHECK: [[PTR_ADD51:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C52]](s32)
-    ; CHECK: [[COPY49:%[0-9]+]]:_(p5) = COPY [[PTR_ADD51]](p5)
-    ; CHECK: G_STORE [[UV49]](s32), [[COPY49]](p5) :: (store 4 into %stack.0 + 196, align 256, addrspace 5)
-    ; CHECK: [[C53:%[0-9]+]]:_(s32) = G_CONSTANT i32 200
-    ; CHECK: [[PTR_ADD52:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C53]](s32)
-    ; CHECK: [[COPY50:%[0-9]+]]:_(p5) = COPY [[PTR_ADD52]](p5)
-    ; CHECK: G_STORE [[UV50]](s32), [[COPY50]](p5) :: (store 4 into %stack.0 + 200, align 256, addrspace 5)
-    ; CHECK: [[C54:%[0-9]+]]:_(s32) = G_CONSTANT i32 204
-    ; CHECK: [[PTR_ADD53:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C54]](s32)
-    ; CHECK: [[COPY51:%[0-9]+]]:_(p5) = COPY [[PTR_ADD53]](p5)
-    ; CHECK: G_STORE [[UV51]](s32), [[COPY51]](p5) :: (store 4 into %stack.0 + 204, align 256, addrspace 5)
-    ; CHECK: [[C55:%[0-9]+]]:_(s32) = G_CONSTANT i32 208
-    ; CHECK: [[PTR_ADD54:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C55]](s32)
-    ; CHECK: [[COPY52:%[0-9]+]]:_(p5) = COPY [[PTR_ADD54]](p5)
-    ; CHECK: G_STORE [[UV52]](s32), [[COPY52]](p5) :: (store 4 into %stack.0 + 208, align 256, addrspace 5)
-    ; CHECK: [[C56:%[0-9]+]]:_(s32) = G_CONSTANT i32 212
-    ; CHECK: [[PTR_ADD55:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C56]](s32)
-    ; CHECK: [[COPY53:%[0-9]+]]:_(p5) = COPY [[PTR_ADD55]](p5)
-    ; CHECK: G_STORE [[UV53]](s32), [[COPY53]](p5) :: (store 4 into %stack.0 + 212, align 256, addrspace 5)
-    ; CHECK: [[C57:%[0-9]+]]:_(s32) = G_CONSTANT i32 216
-    ; CHECK: [[PTR_ADD56:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C57]](s32)
-    ; CHECK: [[COPY54:%[0-9]+]]:_(p5) = COPY [[PTR_ADD56]](p5)
-    ; CHECK: G_STORE [[UV54]](s32), [[COPY54]](p5) :: (store 4 into %stack.0 + 216, align 256, addrspace 5)
-    ; CHECK: [[C58:%[0-9]+]]:_(s32) = G_CONSTANT i32 220
-    ; CHECK: [[PTR_ADD57:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C58]](s32)
-    ; CHECK: [[COPY55:%[0-9]+]]:_(p5) = COPY [[PTR_ADD57]](p5)
-    ; CHECK: G_STORE [[UV55]](s32), [[COPY55]](p5) :: (store 4 into %stack.0 + 220, align 256, addrspace 5)
-    ; CHECK: [[C59:%[0-9]+]]:_(s32) = G_CONSTANT i32 224
-    ; CHECK: [[PTR_ADD58:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C59]](s32)
-    ; CHECK: [[COPY56:%[0-9]+]]:_(p5) = COPY [[PTR_ADD58]](p5)
-    ; CHECK: G_STORE [[UV56]](s32), [[COPY56]](p5) :: (store 4 into %stack.0 + 224, align 256, addrspace 5)
-    ; CHECK: [[C60:%[0-9]+]]:_(s32) = G_CONSTANT i32 228
-    ; CHECK: [[PTR_ADD59:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C60]](s32)
-    ; CHECK: [[COPY57:%[0-9]+]]:_(p5) = COPY [[PTR_ADD59]](p5)
-    ; CHECK: G_STORE [[UV57]](s32), [[COPY57]](p5) :: (store 4 into %stack.0 + 228, align 256, addrspace 5)
-    ; CHECK: [[C61:%[0-9]+]]:_(s32) = G_CONSTANT i32 232
-    ; CHECK: [[PTR_ADD60:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C61]](s32)
-    ; CHECK: [[COPY58:%[0-9]+]]:_(p5) = COPY [[PTR_ADD60]](p5)
-    ; CHECK: G_STORE [[UV58]](s32), [[COPY58]](p5) :: (store 4 into %stack.0 + 232, align 256, addrspace 5)
-    ; CHECK: [[C62:%[0-9]+]]:_(s32) = G_CONSTANT i32 236
-    ; CHECK: [[PTR_ADD61:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C62]](s32)
-    ; CHECK: [[COPY59:%[0-9]+]]:_(p5) = COPY [[PTR_ADD61]](p5)
-    ; CHECK: G_STORE [[UV59]](s32), [[COPY59]](p5) :: (store 4 into %stack.0 + 236, align 256, addrspace 5)
-    ; CHECK: [[C63:%[0-9]+]]:_(s32) = G_CONSTANT i32 240
-    ; CHECK: [[PTR_ADD62:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C63]](s32)
-    ; CHECK: [[COPY60:%[0-9]+]]:_(p5) = COPY [[PTR_ADD62]](p5)
-    ; CHECK: G_STORE [[UV60]](s32), [[COPY60]](p5) :: (store 4 into %stack.0 + 240, align 256, addrspace 5)
-    ; CHECK: [[C64:%[0-9]+]]:_(s32) = G_CONSTANT i32 244
-    ; CHECK: [[PTR_ADD63:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C64]](s32)
-    ; CHECK: [[COPY61:%[0-9]+]]:_(p5) = COPY [[PTR_ADD63]](p5)
-    ; CHECK: G_STORE [[UV61]](s32), [[COPY61]](p5) :: (store 4 into %stack.0 + 244, align 256, addrspace 5)
-    ; CHECK: [[C65:%[0-9]+]]:_(s32) = G_CONSTANT i32 248
-    ; CHECK: [[PTR_ADD64:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C65]](s32)
-    ; CHECK: [[COPY62:%[0-9]+]]:_(p5) = COPY [[PTR_ADD64]](p5)
-    ; CHECK: G_STORE [[UV62]](s32), [[COPY62]](p5) :: (store 4 into %stack.0 + 248, align 256, addrspace 5)
-    ; CHECK: [[C66:%[0-9]+]]:_(s32) = G_CONSTANT i32 252
-    ; CHECK: [[PTR_ADD65:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C66]](s32)
-    ; CHECK: [[COPY63:%[0-9]+]]:_(p5) = COPY [[PTR_ADD65]](p5)
-    ; CHECK: G_STORE [[UV63]](s32), [[COPY63]](p5) :: (store 4 into %stack.0 + 252, align 256, addrspace 5)
-    ; CHECK: G_STORE [[C3]](s32), [[PTR_ADD35]](p5) :: (store 4 into %stack.0 + 132, addrspace 5)
-    ; CHECK: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p5) :: (load 4 from %stack.0 + 132, align 256, addrspace 5)
-    ; CHECK: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load 4 from %stack.0 + 136, align 256, addrspace 5)
-    ; CHECK: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 4 from %stack.0 + 140, align 256, addrspace 5)
-    ; CHECK: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p5) :: (load 4 from %stack.0 + 144, align 256, addrspace 5)
-    ; CHECK: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load 4 from %stack.0 + 148, align 256, addrspace 5)
-    ; CHECK: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p5) :: (load 4 from %stack.0 + 152, align 256, addrspace 5)
-    ; CHECK: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load 4 from %stack.0 + 156, align 256, addrspace 5)
-    ; CHECK: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p5) :: (load 4 from %stack.0 + 160, align 256, addrspace 5)
-    ; CHECK: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load 4 from %stack.0 + 164, align 256, addrspace 5)
-    ; CHECK: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p5) :: (load 4 from %stack.0 + 168, align 256, addrspace 5)
-    ; CHECK: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p5) :: (load 4 from %stack.0 + 172, align 256, addrspace 5)
-    ; CHECK: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p5) :: (load 4 from %stack.0 + 176, align 256, addrspace 5)
-    ; CHECK: [[LOAD16:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load 4 from %stack.0 + 180, align 256, addrspace 5)
-    ; CHECK: [[LOAD17:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD15]](p5) :: (load 4 from %stack.0 + 184, align 256, addrspace 5)
-    ; CHECK: [[LOAD18:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD16]](p5) :: (load 4 from %stack.0 + 188, align 256, addrspace 5)
-    ; CHECK: [[LOAD19:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD17]](p5) :: (load 4 from %stack.0 + 192, align 256, addrspace 5)
-    ; CHECK: [[LOAD20:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load 4 from %stack.0 + 196, align 256, addrspace 5)
-    ; CHECK: [[LOAD21:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD19]](p5) :: (load 4 from %stack.0 + 200, align 256, addrspace 5)
-    ; CHECK: [[LOAD22:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD20]](p5) :: (load 4 from %stack.0 + 204, align 256, addrspace 5)
-    ; CHECK: [[LOAD23:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD21]](p5) :: (load 4 from %stack.0 + 208, align 256, addrspace 5)
-    ; CHECK: [[LOAD24:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load 4 from %stack.0 + 212, align 256, addrspace 5)
-    ; CHECK: [[LOAD25:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD23]](p5) :: (load 4 from %stack.0 + 216, align 256, addrspace 5)
-    ; CHECK: [[LOAD26:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD24]](p5) :: (load 4 from %stack.0 + 220, align 256, addrspace 5)
-    ; CHECK: [[LOAD27:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD25]](p5) :: (load 4 from %stack.0 + 224, align 256, addrspace 5)
-    ; CHECK: [[LOAD28:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p5) :: (load 4 from %stack.0 + 228, align 256, addrspace 5)
-    ; CHECK: [[LOAD29:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD27]](p5) :: (load 4 from %stack.0 + 232, align 256, addrspace 5)
-    ; CHECK: [[LOAD30:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD28]](p5) :: (load 4 from %stack.0 + 236, align 256, addrspace 5)
-    ; CHECK: [[LOAD31:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD29]](p5) :: (load 4 from %stack.0 + 240, align 256, addrspace 5)
-    ; CHECK: [[LOAD32:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p5) :: (load 4 from %stack.0 + 244, align 256, addrspace 5)
-    ; CHECK: [[LOAD33:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD31]](p5) :: (load 4 from %stack.0 + 248, align 256, addrspace 5)
-    ; CHECK: [[LOAD34:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD32]](p5) :: (load
4 from %stack.0 + 252, align 256, addrspace 5) - ; CHECK: [[LOAD35:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD33]](p5) :: (load 4 from %stack.0 + 256, align 256, addrspace 5) - ; CHECK: [[LOAD36:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD34]](p5) :: (load 4 from %stack.0 + 260, align 256, addrspace 5) - ; CHECK: [[COPY64:%[0-9]+]]:_(p5) = COPY [[PTR_ADD35]](p5) - ; CHECK: [[LOAD37:%[0-9]+]]:_(s32) = G_LOAD [[COPY64]](p5) :: (load 4 from %stack.0 + 264, align 256, addrspace 5) - ; CHECK: [[LOAD38:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD36]](p5) :: (load 4 from %stack.0 + 268, align 256, addrspace 5) - ; CHECK: [[LOAD39:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD37]](p5) :: (load 4 from %stack.0 + 272, align 256, addrspace 5) - ; CHECK: [[LOAD40:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD38]](p5) :: (load 4 from %stack.0 + 276, align 256, addrspace 5) - ; CHECK: [[LOAD41:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD39]](p5) :: (load 4 from %stack.0 + 280, align 256, addrspace 5) - ; CHECK: [[LOAD42:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD40]](p5) :: (load 4 from %stack.0 + 284, align 256, addrspace 5) - ; CHECK: [[LOAD43:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD41]](p5) :: (load 4 from %stack.0 + 288, align 256, addrspace 5) - ; CHECK: [[LOAD44:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD42]](p5) :: (load 4 from %stack.0 + 292, align 256, addrspace 5) - ; CHECK: [[LOAD45:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD43]](p5) :: (load 4 from %stack.0 + 296, align 256, addrspace 5) - ; CHECK: [[LOAD46:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD44]](p5) :: (load 4 from %stack.0 + 300, align 256, addrspace 5) - ; CHECK: [[LOAD47:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD45]](p5) :: (load 4 from %stack.0 + 304, align 256, addrspace 5) - ; CHECK: [[LOAD48:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD46]](p5) :: (load 4 from %stack.0 + 308, align 256, addrspace 5) - ; CHECK: [[LOAD49:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD47]](p5) :: (load 4 from %stack.0 + 312, align 256, addrspace 5) - ; CHECK: [[LOAD50:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD48]](p5) :: (load 4 from %stack.0 + 316, align 256, addrspace 5) - ; CHECK: [[LOAD51:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD49]](p5) :: (load 4 from %stack.0 + 320, align 256, addrspace 5) - ; CHECK: [[LOAD52:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD50]](p5) :: (load 4 from %stack.0 + 324, align 256, addrspace 5) - ; CHECK: [[LOAD53:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD51]](p5) :: (load 4 from %stack.0 + 328, align 256, addrspace 5) - ; CHECK: [[LOAD54:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD52]](p5) :: (load 4 from %stack.0 + 332, align 256, addrspace 5) - ; CHECK: [[LOAD55:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD53]](p5) :: (load 4 from %stack.0 + 336, align 256, addrspace 5) - ; CHECK: [[LOAD56:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD54]](p5) :: (load 4 from %stack.0 + 340, align 256, addrspace 5) - ; CHECK: [[LOAD57:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD55]](p5) :: (load 4 from %stack.0 + 344, align 256, addrspace 5) - ; CHECK: [[LOAD58:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD56]](p5) :: (load 4 from %stack.0 + 348, align 256, addrspace 5) - ; CHECK: [[LOAD59:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD57]](p5) :: (load 4 from %stack.0 + 352, align 256, addrspace 5) - ; CHECK: [[LOAD60:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD58]](p5) :: (load 4 from %stack.0 + 356, align 256, addrspace 5) - ; CHECK: [[LOAD61:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD59]](p5) :: (load 4 from %stack.0 + 360, align 256, addrspace 5) - ; CHECK: [[LOAD62:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD60]](p5) :: (load 4 from %stack.0 + 364, align 256, addrspace 5) - ; CHECK: [[LOAD63:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD61]](p5) :: (load 4 from %stack.0 
+ 368, align 256, addrspace 5) - ; CHECK: [[LOAD64:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD62]](p5) :: (load 4 from %stack.0 + 372, align 256, addrspace 5) - ; CHECK: [[LOAD65:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD63]](p5) :: (load 4 from %stack.0 + 376, align 256, addrspace 5) - ; CHECK: [[LOAD66:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD64]](p5) :: (load 4 from %stack.0 + 380, align 256, addrspace 5) - ; CHECK: [[LOAD67:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD65]](p5) :: (load 4 from %stack.0 + 384, align 256, addrspace 5) - ; CHECK: [[COPY65:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32) - ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32) - ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32) - ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD16]](s32), [[LOAD17]](s32), [[LOAD18]](s32), [[LOAD19]](s32) - ; CHECK: [[BUILD_VECTOR4:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD20]](s32), [[LOAD21]](s32), [[LOAD22]](s32), [[LOAD23]](s32) - ; CHECK: [[BUILD_VECTOR5:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD24]](s32), [[LOAD25]](s32), [[LOAD26]](s32), [[LOAD27]](s32) - ; CHECK: [[BUILD_VECTOR6:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD28]](s32), [[LOAD29]](s32), [[LOAD30]](s32), [[LOAD31]](s32) - ; CHECK: [[BUILD_VECTOR7:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD32]](s32), [[LOAD33]](s32), [[LOAD34]](s32), [[LOAD35]](s32) - ; CHECK: [[BUILD_VECTOR8:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD36]](s32), [[LOAD37]](s32), [[LOAD38]](s32), [[LOAD39]](s32) - ; CHECK: [[BUILD_VECTOR9:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD40]](s32), [[LOAD41]](s32), [[LOAD42]](s32), [[LOAD43]](s32) - ; CHECK: [[BUILD_VECTOR10:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD44]](s32), [[LOAD45]](s32), [[LOAD46]](s32), [[LOAD47]](s32) - ; CHECK: [[BUILD_VECTOR11:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD48]](s32), [[LOAD49]](s32), [[LOAD50]](s32), [[LOAD51]](s32) - ; CHECK: [[BUILD_VECTOR12:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD52]](s32), [[LOAD53]](s32), [[LOAD54]](s32), [[LOAD55]](s32) - ; CHECK: [[BUILD_VECTOR13:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD56]](s32), [[LOAD57]](s32), [[LOAD58]](s32), [[LOAD59]](s32) - ; CHECK: [[BUILD_VECTOR14:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD60]](s32), [[LOAD61]](s32), [[LOAD62]](s32), [[LOAD63]](s32) - ; CHECK: [[BUILD_VECTOR15:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD64]](s32), [[LOAD65]](s32), [[LOAD66]](s32), [[LOAD67]](s32) - ; CHECK: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[COPY65]](p1) :: (store 16, align 4, addrspace 1) - ; CHECK: [[C67:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; CHECK: [[PTR_ADD66:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C67]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR1]](<4 x s32>), [[PTR_ADD66]](p1) :: (store 16 + 16, align 4, addrspace 1) - ; CHECK: [[C68:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 - ; CHECK: [[PTR_ADD67:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C68]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR2]](<4 x s32>), [[PTR_ADD67]](p1) :: (store 16 + 32, align 4, addrspace 1) - ; CHECK: [[C69:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; CHECK: [[PTR_ADD68:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C69]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR3]](<4 x s32>), [[PTR_ADD68]](p1) :: (store 16 + 48, align 4, addrspace 1) - ; CHECK: 
[[PTR_ADD69:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR4]](<4 x s32>), [[PTR_ADD69]](p1) :: (store 16 + 64, align 4, addrspace 1) - ; CHECK: [[C70:%[0-9]+]]:_(s64) = G_CONSTANT i64 80 - ; CHECK: [[PTR_ADD70:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C70]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR5]](<4 x s32>), [[PTR_ADD70]](p1) :: (store 16 + 80, align 4, addrspace 1) - ; CHECK: [[C71:%[0-9]+]]:_(s64) = G_CONSTANT i64 96 - ; CHECK: [[PTR_ADD71:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C71]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR6]](<4 x s32>), [[PTR_ADD71]](p1) :: (store 16 + 96, align 4, addrspace 1) - ; CHECK: [[C72:%[0-9]+]]:_(s64) = G_CONSTANT i64 112 - ; CHECK: [[PTR_ADD72:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C72]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR7]](<4 x s32>), [[PTR_ADD72]](p1) :: (store 16 + 112, align 4, addrspace 1) - ; CHECK: [[PTR_ADD73:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C1]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR8]](<4 x s32>), [[PTR_ADD73]](p1) :: (store 16 + 128, align 4, addrspace 1) - ; CHECK: [[C73:%[0-9]+]]:_(s64) = G_CONSTANT i64 144 - ; CHECK: [[PTR_ADD74:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C73]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR9]](<4 x s32>), [[PTR_ADD74]](p1) :: (store 16 + 144, align 4, addrspace 1) - ; CHECK: [[C74:%[0-9]+]]:_(s64) = G_CONSTANT i64 160 - ; CHECK: [[PTR_ADD75:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C74]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR10]](<4 x s32>), [[PTR_ADD75]](p1) :: (store 16 + 160, align 4, addrspace 1) - ; CHECK: [[C75:%[0-9]+]]:_(s64) = G_CONSTANT i64 176 - ; CHECK: [[PTR_ADD76:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C75]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR11]](<4 x s32>), [[PTR_ADD76]](p1) :: (store 16 + 176, align 4, addrspace 1) - ; CHECK: [[PTR_ADD77:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C2]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR12]](<4 x s32>), [[PTR_ADD77]](p1) :: (store 16 + 192, align 4, addrspace 1) - ; CHECK: [[C76:%[0-9]+]]:_(s64) = G_CONSTANT i64 208 - ; CHECK: [[PTR_ADD78:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C76]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR13]](<4 x s32>), [[PTR_ADD78]](p1) :: (store 16 + 208, align 4, addrspace 1) - ; CHECK: [[C77:%[0-9]+]]:_(s64) = G_CONSTANT i64 224 - ; CHECK: [[PTR_ADD79:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C77]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR14]](<4 x s32>), [[PTR_ADD79]](p1) :: (store 16 + 224, align 4, addrspace 1) - ; CHECK: [[C78:%[0-9]+]]:_(s64) = G_CONSTANT i64 240 - ; CHECK: [[PTR_ADD80:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY65]], [[C78]](s64) - ; CHECK: G_STORE [[BUILD_VECTOR15]](<4 x s32>), [[PTR_ADD80]](p1) :: (store 16 + 240, align 4, addrspace 1) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[LOAD2]](<16 x s32>), [[LOAD3]](<16 x s32>) + ; CHECK: [[INSERT:%[0-9]+]]:_(<32 x s32>) = G_INSERT [[CONCAT_VECTORS]], [[C3]](s32), 32 + ; CHECK: [[COPY1:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 + ; CHECK: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>), [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<16 x s32>) + ; CHECK: [[UV4:%[0-9]+]]:_(<4 x s32>), [[UV5:%[0-9]+]]:_(<4 x s32>), [[UV6:%[0-9]+]]:_(<4 x s32>), [[UV7:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD1]](<16 x s32>) + ; CHECK: [[UV8:%[0-9]+]]:_(<4 x s32>), [[UV9:%[0-9]+]]:_(<4 x s32>), [[UV10:%[0-9]+]]:_(<4 x s32>), [[UV11:%[0-9]+]]:_(<4 x s32>), [[UV12:%[0-9]+]]:_(<4 x s32>), [[UV13:%[0-9]+]]:_(<4 x s32>), [[UV14:%[0-9]+]]:_(<4 x s32>), [[UV15:%[0-9]+]]:_(<4 x 
s32>) = G_UNMERGE_VALUES [[INSERT]](<32 x s32>) + ; CHECK: G_STORE [[UV]](<4 x s32>), [[COPY1]](p1) :: (store 16, align 4, addrspace 1) + ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C4]](s64) + ; CHECK: G_STORE [[UV1]](<4 x s32>), [[PTR_ADD3]](p1) :: (store 16 + 16, align 4, addrspace 1) + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD4:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C5]](s64) + ; CHECK: G_STORE [[UV2]](<4 x s32>), [[PTR_ADD4]](p1) :: (store 16 + 32, align 4, addrspace 1) + ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD5:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C6]](s64) + ; CHECK: G_STORE [[UV3]](<4 x s32>), [[PTR_ADD5]](p1) :: (store 16 + 48, align 4, addrspace 1) + ; CHECK: [[PTR_ADD6:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C]](s64) + ; CHECK: G_STORE [[UV4]](<4 x s32>), [[PTR_ADD6]](p1) :: (store 16 + 64, align 4, addrspace 1) + ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 80 + ; CHECK: [[PTR_ADD7:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C7]](s64) + ; CHECK: G_STORE [[UV5]](<4 x s32>), [[PTR_ADD7]](p1) :: (store 16 + 80, align 4, addrspace 1) + ; CHECK: [[C8:%[0-9]+]]:_(s64) = G_CONSTANT i64 96 + ; CHECK: [[PTR_ADD8:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C8]](s64) + ; CHECK: G_STORE [[UV6]](<4 x s32>), [[PTR_ADD8]](p1) :: (store 16 + 96, align 4, addrspace 1) + ; CHECK: [[C9:%[0-9]+]]:_(s64) = G_CONSTANT i64 112 + ; CHECK: [[PTR_ADD9:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C9]](s64) + ; CHECK: G_STORE [[UV7]](<4 x s32>), [[PTR_ADD9]](p1) :: (store 16 + 112, align 4, addrspace 1) + ; CHECK: [[PTR_ADD10:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C1]](s64) + ; CHECK: G_STORE [[UV8]](<4 x s32>), [[PTR_ADD10]](p1) :: (store 16 + 128, align 4, addrspace 1) + ; CHECK: [[C10:%[0-9]+]]:_(s64) = G_CONSTANT i64 144 + ; CHECK: [[PTR_ADD11:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C10]](s64) + ; CHECK: G_STORE [[UV9]](<4 x s32>), [[PTR_ADD11]](p1) :: (store 16 + 144, align 4, addrspace 1) + ; CHECK: [[C11:%[0-9]+]]:_(s64) = G_CONSTANT i64 160 + ; CHECK: [[PTR_ADD12:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C11]](s64) + ; CHECK: G_STORE [[UV10]](<4 x s32>), [[PTR_ADD12]](p1) :: (store 16 + 160, align 4, addrspace 1) + ; CHECK: [[C12:%[0-9]+]]:_(s64) = G_CONSTANT i64 176 + ; CHECK: [[PTR_ADD13:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C12]](s64) + ; CHECK: G_STORE [[UV11]](<4 x s32>), [[PTR_ADD13]](p1) :: (store 16 + 176, align 4, addrspace 1) + ; CHECK: [[PTR_ADD14:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C2]](s64) + ; CHECK: G_STORE [[UV12]](<4 x s32>), [[PTR_ADD14]](p1) :: (store 16 + 192, align 4, addrspace 1) + ; CHECK: [[C13:%[0-9]+]]:_(s64) = G_CONSTANT i64 208 + ; CHECK: [[PTR_ADD15:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C13]](s64) + ; CHECK: G_STORE [[UV13]](<4 x s32>), [[PTR_ADD15]](p1) :: (store 16 + 208, align 4, addrspace 1) + ; CHECK: [[C14:%[0-9]+]]:_(s64) = G_CONSTANT i64 224 + ; CHECK: [[PTR_ADD16:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C14]](s64) + ; CHECK: G_STORE [[UV14]](<4 x s32>), [[PTR_ADD16]](p1) :: (store 16 + 224, align 4, addrspace 1) + ; CHECK: [[C15:%[0-9]+]]:_(s64) = G_CONSTANT i64 240 + ; CHECK: [[PTR_ADD17:%[0-9]+]]:_(p1) = G_PTR_ADD [[COPY1]], [[C15]](s64) + ; CHECK: G_STORE [[UV15]](<4 x s32>), [[PTR_ADD17]](p1) :: (store 16 + 240, align 4, addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(s32) = G_CONSTANT i32 33 %2:_(<64 x s32>) = G_LOAD %0 :: (load 256, align 4, addrspace 4) diff --git 
a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 3a9fb59cae6f03..bd54b6d7e35919 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -3051,4 +3051,85 @@ TEST_F(AArch64GISelMITest, MoreElementsFreeze) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +// Test fewer elements of G_INSERT_VECTOR_ELEMENT +TEST_F(AArch64GISelMITest, FewerElementsInsertVectorElt) { + setUp(); + if (!TM) + return; + + DefineLegalizerInfo(A, {}); + + LLT P0{LLT::pointer(0, 64)}; + LLT S64{LLT::scalar(64)}; + LLT S16{LLT::scalar(16)}; + LLT V2S16{LLT::vector(2, 16)}; + LLT V3S16{LLT::vector(3, 16)}; + LLT V8S16{LLT::vector(8, 16)}; + + auto Ptr0 = B.buildIntToPtr(P0, Copies[0]); + auto VectorV8 = B.buildLoad(V8S16, Ptr0, MachinePointerInfo(), Align(8)); + auto Value = B.buildTrunc(S16, Copies[1]); + + auto Seven = B.buildConstant(S64, 7); + auto InsertV8Constant7_0 = + B.buildInsertVectorElement(V8S16, VectorV8, Value, Seven); + auto InsertV8Constant7_1 = + B.buildInsertVectorElement(V8S16, VectorV8, Value, Seven); + + B.buildStore(InsertV8Constant7_0, Ptr0, MachinePointerInfo(), Align(8), + MachineMemOperand::MOVolatile); + B.buildStore(InsertV8Constant7_1, Ptr0, MachinePointerInfo(), Align(8), + MachineMemOperand::MOVolatile); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + + // Perform Legalization + B.setInsertPt(*EntryMBB, InsertV8Constant7_0->getIterator()); + + // This should index the high element of the 4th piece of an unmerge. + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.fewerElementsVector(*InsertV8Constant7_0, 0, V2S16)); + + // This case requires extracting an intermediate vector type into the target + // v4s16. 
+ B.setInsertPt(*EntryMBB, InsertV8Constant7_1->getIterator()); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.fewerElementsVector(*InsertV8Constant7_1, 0, V3S16)); + + const auto *CheckStr = R"( + CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY + CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY + CHECK: [[PTR0:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY0]] + CHECK: [[VEC8:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[PTR0]]:_(p0) :: (load 16, align 8) + CHECK: [[INSERT_VAL:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]] + + + CHECK: [[UNMERGE0:%[0-9]+]]:_(<2 x s16>), [[UNMERGE1:%[0-9]+]]:_(<2 x s16>), [[UNMERGE2:%[0-9]+]]:_(<2 x s16>), [[UNMERGE3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[VEC8]] + CHECK: [[ONE:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + CHECK: [[SUB_INSERT_7:%[0-9]+]]:_(<2 x s16>) = G_INSERT_VECTOR_ELT [[UNMERGE3]]:_, [[INSERT_VAL]]:_(s16), [[ONE]] + CHECK: [[INSERT_V8_7_0:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[UNMERGE0]]:_(<2 x s16>), [[UNMERGE1]]:_(<2 x s16>), [[UNMERGE2]]:_(<2 x s16>), [[SUB_INSERT_7]]:_(<2 x s16>) + + + CHECK: [[UNMERGE1_0:%[0-9]+]]:_(s16), [[UNMERGE1_1:%[0-9]+]]:_(s16), [[UNMERGE1_2:%[0-9]+]]:_(s16), [[UNMERGE1_3:%[0-9]+]]:_(s16), [[UNMERGE1_4:%[0-9]+]]:_(s16), [[UNMERGE1_5:%[0-9]+]]:_(s16), [[UNMERGE1_6:%[0-9]+]]:_(s16), [[UNMERGE1_7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[VEC8]]:_(<8 x s16>) + CHECK: [[IMPDEF_S16:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + CHECK: [[BUILD0:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_0]]:_(s16), [[UNMERGE1_1]]:_(s16), [[UNMERGE1_2]]:_(s16) + CHECK: [[BUILD1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_3]]:_(s16), [[UNMERGE1_4]]:_(s16), [[UNMERGE1_5]]:_(s16) + CHECK: [[BUILD2:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UNMERGE1_6]]:_(s16), [[UNMERGE1_7]]:_(s16), [[IMPDEF_S16]]:_(s16) + CHECK: [[IMPDEF_V3S16:%[0-9]+]]:_(<3 x s16>) = G_IMPLICIT_DEF + CHECK: [[ONE_1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + CHECK: [[SUB_INSERT_7_V3S16:%[0-9]+]]:_(<3 x s16>) = G_INSERT_VECTOR_ELT [[BUILD2]]:_, [[INSERT_VAL]]:_(s16), [[ONE_1]] + CHECK: [[WIDE_CONCAT:%[0-9]+]]:_(<24 x s16>) = G_CONCAT_VECTORS [[BUILD0]]:_(<3 x s16>), [[BUILD1]]:_(<3 x s16>), [[SUB_INSERT_7_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>), [[IMPDEF_V3S16]]:_(<3 x s16>) + CHECK: [[INSERT_V8_7_1:%[0-9]+]]:_(<8 x s16>) = G_EXTRACT [[WIDE_CONCAT]]:_(<24 x s16>), 0 + + CHECK: G_STORE [[INSERT_V8_7_0]] + CHECK: G_STORE [[INSERT_V8_7_1]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace From 9887a70e7a768f6fca135587ce3e62d691a3646d Mon Sep 17 00:00:00 2001 From: Tue Ly Date: Tue, 28 Jul 2020 01:35:18 -0400 Subject: [PATCH 072/101] [libc] Add ULP function to MPFRNumber class to test correctly rounded functions such as SQRT, FMA. Add ULP function to MPFRNumber class to test correctly rounded functions. 
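To make the tie-handling concrete (a worked example with IEEE-754 doubles, following the definitions in the code below): for value = 1.0 the eps exponent is 1023 - 1023 - 52 = -52, so eps(1.0) = 2^-52. If the high-precision MPFR result is 1.0 + 2^-53, then ULP = 2^-53 / 2^-52 = 0.5, exactly a tie; at a 0.5-ULP bound such a result is accepted only when the last mantissa bit of the libc value is even, which matches round-to-nearest-even.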
Differential Revision: https://reviews.llvm.org/D84725 --- libc/utils/MPFRWrapper/MPFRUtils.cpp | 100 +++++++++++++++++++++++---- libc/utils/MPFRWrapper/MPFRUtils.h | 22 ++++-- 2 files changed, 106 insertions(+), 16 deletions(-) diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index c6020f471e88d8..c97e89ce9b2b46 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -9,6 +9,7 @@ #include "MPFRUtils.h" #include "utils/FPUtil/FPBits.h" +#include "utils/FPUtil/TestHelpers.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" @@ -119,6 +120,9 @@ class MPFRNumber { case Operation::Sin: mpfr_sin(value, mpfrInput.value, MPFR_RNDN); break; + case Operation::Sqrt: + mpfr_sqrt(value, mpfrInput.value, MPFR_RNDN); + break; case Operation::Trunc: mpfr_trunc(value, mpfrInput.value); break; @@ -155,9 +159,59 @@ class MPFRNumber { } // These functions are useful for debugging. - float asFloat() const { return mpfr_get_flt(value, MPFR_RNDN); } - double asDouble() const { return mpfr_get_d(value, MPFR_RNDN); } + template <typename T> T as() const; + + template <> float as<float>() const { return mpfr_get_flt(value, MPFR_RNDN); } + template <> double as<double>() const { return mpfr_get_d(value, MPFR_RNDN); } + template <> long double as<long double>() const { + return mpfr_get_ld(value, MPFR_RNDN); + } + void dump(const char *msg) const { mpfr_printf("%s%.128Rf\n", msg, value); } + + // Return the ULP (units-in-the-last-place) difference between the + // stored MPFR and a floating point number. + // + // We define: + // ULP(mpfr_value, value) = abs(mpfr_value - value) / eps(value) + // + // Remarks: + // 1. ULP < 0.5 will imply that the value is correctly rounded. + // 2. We expect that this value and the value to be compared (the [input] + // argument) are reasonably close, and we will provide an upper bound + // of ULP value for testing. Moreover, most of the fractional parts of + // ULP value do not matter much, so using double as the return type + // should be good enough. + template <typename T> + cpp::EnableIfType<cpp::IsFloatingPointType<T>::Value, double> ulp(T input) { + fputil::FPBits<T> bits(input); + MPFRNumber mpfrInput(input); + + // abs(value - input) + mpfr_sub(mpfrInput.value, value, mpfrInput.value, MPFR_RNDN); + mpfr_abs(mpfrInput.value, mpfrInput.value, MPFR_RNDN); + + // get eps(input) + int epsExponent = bits.exponent - fputil::FPBits<T>::exponentBias - + fputil::MantissaWidth<T>::value; + if (bits.exponent == 0) { + // correcting denormal exponent + ++epsExponent; + } else if ((bits.mantissa == 0) && (bits.exponent > 1) && + mpfr_less_p(value, mpfrInput.value)) { + // when the input is exactly 2^n, the distance (epsilon) between the input + // and the next floating point number is different from the distance to + // the previous floating point number. So in that case, if the correct + // value from MPFR is smaller than the input, we use the smaller epsilon. + --epsExponent; + } + + // Since eps(value) is of the form 2^e, instead of dividing by such a + // number, we multiply by its inverse 2^{-e}.
+ mpfr_mul_2si(mpfrInput.value, mpfrInput.value, -epsExponent, MPFR_RNDN); + + return mpfrInput.as<double>(); + } }; namespace internal { @@ -167,19 +221,26 @@ void MPFRMatcher<T>::explainError(testutils::StreamWrapper &OS) { MPFRNumber mpfrResult(operation, input); MPFRNumber mpfrInput(input); MPFRNumber mpfrMatchValue(matchValue); - MPFRNumber mpfrToleranceValue(matchValue, tolerance); FPBits<T> inputBits(input); FPBits<T> matchBits(matchValue); - // TODO: Call to llvm::utohexstr implicitly converts __uint128_t values to - // uint64_t values. This can be fixed using a custom wrapper for - // llvm::utohexstr to handle __uint128_t values correctly. + FPBits<T> mpfrResultBits(mpfrResult.as<T>()); OS << "Match value not within tolerance value of MPFR result:\n" - << " Input decimal: " << mpfrInput.str() << '\n' - << " Input bits: 0x" << llvm::utohexstr(inputBits.bitsAsUInt()) << '\n' - << " Match decimal: " << mpfrMatchValue.str() << '\n' - << " Match bits: 0x" << llvm::utohexstr(matchBits.bitsAsUInt()) << '\n' - << " MPFR result: " << mpfrResult.str() << '\n' - << "Tolerance value: " << mpfrToleranceValue.str() << '\n'; + << " Input decimal: " << mpfrInput.str() << '\n'; + __llvm_libc::fputil::testing::describeValue(" Input bits: ", input, OS); + OS << '\n' << " Match decimal: " << mpfrMatchValue.str() << '\n'; + __llvm_libc::fputil::testing::describeValue(" Match bits: ", matchValue, + OS); + OS << '\n' << " MPFR result: " << mpfrResult.str() << '\n'; + __llvm_libc::fputil::testing::describeValue( + " MPFR rounded: ", mpfrResult.as<T>(), OS); + OS << '\n'; + if (useULP) { + OS << " ULP error: " << std::to_string(mpfrResult.ulp(matchValue)) + << '\n'; + } else { + MPFRNumber mpfrToleranceValue = MPFRNumber(matchValue, tolerance); + OS << "Tolerance value: " << mpfrToleranceValue.str() << '\n'; + } } template void MPFRMatcher<float>::explainError(testutils::StreamWrapper &); @@ -201,6 +262,21 @@ template bool compare<double>(Operation, double, double, const Tolerance &); template bool compare<long double>(Operation, long double, long double, const Tolerance &); +template <typename T> +bool compare(Operation op, T input, T libcResult, double ulpError) { + // If the ulp error is exactly 0.5 (i.e., a tie), we would check that the result + // is rounded to the nearest even.
+ MPFRNumber mpfrResult(op, input); + double ulp = mpfrResult.ulp(libcResult); + bool bitsAreEven = ((FPBits<T>(libcResult).bitsAsUInt() & 1) == 0); + return (ulp < ulpError) || + ((ulp == ulpError) && ((ulp != 0.5) || bitsAreEven)); +} + +template bool compare<float>(Operation, float, float, double); +template bool compare<double>(Operation, double, double, double); +template bool compare<long double>(Operation, long double, long double, double); + } // namespace internal } // namespace mpfr diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 56281656533250..633c67ff8570fb 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -48,6 +48,7 @@ enum class Operation : int { Floor, Round, Sin, + Sqrt, Trunc }; @@ -56,6 +57,9 @@ namespace internal { template <typename T> bool compare(Operation op, T input, T libcOutput, const Tolerance &t); +template <typename T> +bool compare(Operation op, T input, T libcOutput, double t); + template <typename T> class MPFRMatcher : public testing::Matcher<T> { static_assert(__llvm_libc::cpp::IsFloatingPointType<T>::Value, "MPFRMatcher can only be used with floating point values."); @@ -64,14 +68,21 @@ template <typename T> class MPFRMatcher : public testing::Matcher<T> { T input; Tolerance tolerance; T matchValue; + double ulpTolerance; + bool useULP; public: MPFRMatcher(Operation op, T testInput, Tolerance &t) : operation(op), input(testInput), tolerance(t), useULP(false) {} + MPFRMatcher(Operation op, T testInput, double ulpTolerance) : operation(op), input(testInput), ulpTolerance(ulpTolerance), + useULP(true) {} bool match(T libcResult) { matchValue = libcResult; - return internal::compare(operation, input, libcResult, tolerance); + return (useULP + ? internal::compare(operation, input, libcResult, ulpTolerance) : internal::compare(operation, input, libcResult, tolerance)); } void explainError(testutils::StreamWrapper &OS) override; @@ -79,9 +90,12 @@ template <typename T> class MPFRMatcher : public testing::Matcher<T> { } // namespace internal -template <typename T> +template <typename T, typename U> __attribute__((no_sanitize("address"))) -internal::MPFRMatcher<T> getMPFRMatcher(Operation op, T input, Tolerance t) { +typename cpp::EnableIfType<cpp::IsSameV<U, Tolerance> || + cpp::IsSameV<U, double>, + internal::MPFRMatcher<T>> +getMPFRMatcher(Operation op, T input, U t) { static_assert( __llvm_libc::cpp::IsFloatingPointType<T>::Value, "getMPFRMatcher can only be used to match floating point results."); From f768eb216f5924219c845515c606ab0703825634 Mon Sep 17 00:00:00 2001 From: Siva Chandra Reddy Date: Tue, 18 Aug 2020 11:04:58 -0700 Subject: [PATCH 073/101] [libc][obvious] Fix link order of math tests.
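This matters most likely because the helper libraries are linked as static archives, where a user must appear on the link line before its definer: MPFRUtils.cpp now calls into LibcFPTestHelpers (describeValue), so LibcFPTestHelpers has to be linked after libcMPFRWrapper rather than before it.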
--- libc/test/src/math/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index f5a1e0e51a715f..e73de54035642e 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -18,10 +18,10 @@ function(add_fp_unittest name) add_libc_unittest(${name} ${MATH_UNITTEST_UNPARSED_ARGUMENTS}) get_fq_target_name(${name} fq_target_name) - target_link_libraries(${fq_target_name} PRIVATE LibcFPTestHelpers) if(MATH_UNITTEST_NEED_MPFR) target_link_libraries(${fq_target_name} PRIVATE libcMPFRWrapper -lmpfr -lgmp) endif() + target_link_libraries(${fq_target_name} PRIVATE LibcFPTestHelpers) endfunction(add_fp_unittest) add_fp_unittest( From f29e6277ad6bcff36ed950dbf8effddc59ba9c28 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Tue, 18 Aug 2020 09:23:48 -0700 Subject: [PATCH 074/101] [GlobalISel][CallLowering] Don't tail call with non-forwarded explicit sret Similar to this commit: faf8065a99817bcb10e6f09b558fe3e0972c35ce Testcase is pretty much the same as test/CodeGen/AArch64/tailcall-explicit-sret.ll Except it uses i64 (since we don't handle the i1024 return values yet), and doesn't have indirect tail call testcases (because we can't translate those yet). Differential Revision: https://reviews.llvm.org/D86148 --- llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 19 ++-- .../call-translator-tail-call-sret.ll | 100 ++++++++++++++++++ 2 files changed, 113 insertions(+), 6 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/call-translator-tail-call-sret.ll diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index e443f603def6b3..cf1059c67b4a05 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -65,6 +65,12 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, std::function GetCalleeReg) const { CallLoweringInfo Info; const DataLayout &DL = MIRBuilder.getDataLayout(); + MachineFunction &MF = MIRBuilder.getMF(); + bool CanBeTailCalled = CB.isTailCall() && + isInTailCallPosition(CB, MF.getTarget()) && + (MF.getFunction() + .getFnAttribute("disable-tail-calls") + .getValueAsString() != "true"); // First step is to marshall all the function's parameters into the correct // physregs and memory locations. Gather the sequence of argument types that @@ -75,6 +81,12 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, ArgInfo OrigArg{ArgRegs[i], Arg->getType(), getAttributesForArgIdx(CB, i), i < NumFixedArgs}; setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB); + + // If we have an explicit sret argument that is an Instruction, (i.e., it + // might point to function-local memory), we can't meaningfully tail-call. 
+ if (OrigArg.Flags[0].isSRet() && isa<Instruction>(&Arg)) + CanBeTailCalled = false; + Info.OrigArgs.push_back(OrigArg); ++i; } @@ -91,16 +103,11 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, if (!Info.OrigRet.Ty->isVoidTy()) setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB); - MachineFunction &MF = MIRBuilder.getMF(); Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees); Info.CallConv = CB.getCallingConv(); Info.SwiftErrorVReg = SwiftErrorVReg; Info.IsMustTailCall = CB.isMustTailCall(); - Info.IsTailCall = - CB.isTailCall() && isInTailCallPosition(CB, MF.getTarget()) && - (MF.getFunction() - .getFnAttribute("disable-tail-calls") - .getValueAsString() != "true"); + Info.IsTailCall = CanBeTailCalled; Info.IsVarArg = CB.getFunctionType()->isVarArg(); return lowerCall(MIRBuilder, Info); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-tail-call-sret.ll b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-tail-call-sret.ll new file mode 100644 index 00000000000000..a9a93d1b7f7a93 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/call-translator-tail-call-sret.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc < %s -mtriple arm64-apple-darwin -global-isel -stop-after=irtranslator -verify-machineinstrs | FileCheck %s + +; Check that we don't try to tail-call with a non-forwarded sret parameter. +declare void @test_explicit_sret(i64* sret) + +; Forwarded explicit sret pointer => we can tail call. +define void @can_tail_call_forwarded_explicit_sret_ptr(i64* sret %arg) { + ; CHECK-LABEL: name: can_tail_call_forwarded_explicit_sret_ptr + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x8 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x8 + ; CHECK: $x8 = COPY [[COPY]](p0) + ; CHECK: TCRETURNdi @test_explicit_sret, 0, csr_darwin_aarch64_aapcs, implicit $sp, implicit $x8 + tail call void @test_explicit_sret(i64* %arg) + ret void +} + +; Not marked as tail, so don't tail call.
+define void @test_call_explicit_sret(i64* sret %arg) { + ; CHECK-LABEL: name: test_call_explicit_sret + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x8 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x8 + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $x8 = COPY [[COPY]](p0) + ; CHECK: BL @test_explicit_sret, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x8 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR + call void @test_explicit_sret(i64* %arg) + ret void +} + +define void @dont_tail_call_explicit_sret_alloca_unused() { + ; CHECK-LABEL: name: dont_tail_call_explicit_sret_alloca_unused + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.l + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $x8 = COPY [[FRAME_INDEX]](p0) + ; CHECK: BL @test_explicit_sret, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x8 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR + %l = alloca i64, align 8 + tail call void @test_explicit_sret(i64* %l) + ret void +} + +define void @dont_tail_call_explicit_sret_alloca_dummyusers(i64* %ptr) { + ; CHECK-LABEL: name: dont_tail_call_explicit_sret_alloca_dummyusers + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.l + ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load 8 from %ir.ptr) + ; CHECK: G_STORE [[LOAD]](s64), [[FRAME_INDEX]](p0) :: (store 8 into %ir.l) + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $x8 = COPY [[FRAME_INDEX]](p0) + ; CHECK: BL @test_explicit_sret, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x8 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR + %l = alloca i64, align 8 + %r = load i64, i64* %ptr, align 8 + store i64 %r, i64* %l, align 8 + tail call void @test_explicit_sret(i64* %l) + ret void +} + +define void @dont_tail_call_tailcall_explicit_sret_gep(i64* %ptr) { + ; CHECK-LABEL: name: dont_tail_call_tailcall_explicit_sret_gep + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $x8 = COPY [[PTR_ADD]](p0) + ; CHECK: BL @test_explicit_sret, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x8 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: RET_ReallyLR + %ptr2 = getelementptr i64, i64* %ptr, i32 1 + tail call void @test_explicit_sret(i64* %ptr2) + ret void +} + +define i64 @dont_tail_call_sret_alloca_returned() { + ; CHECK-LABEL: name: dont_tail_call_sret_alloca_returned + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0.l + ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp + ; CHECK: $x8 = COPY [[FRAME_INDEX]](p0) + ; CHECK: BL @test_explicit_sret, csr_darwin_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x8 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp + ; CHECK: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (dereferenceable load 8 from %ir.l) + ; CHECK: $x0 = COPY [[LOAD]](s64) + ; CHECK: RET_ReallyLR implicit $x0 + %l = alloca i64, align 8 + 
tail call void @test_explicit_sret(i64* %l) + %r = load i64, i64* %l, align 8 + ret i64 %r +} From bf36e902953a4bf8ac0aae5a498445951fbc3882 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Tue, 18 Aug 2020 10:37:10 -0700 Subject: [PATCH 075/101] [GlobalISel][CallLowering] NFC: Unify flag-setting from CallBase + AttributeList It's annoying to have to maintain multiple, nearly identical chains of if statements which all set the same attributes. Add a helper function, `addFlagsUsingAttrFn` which performs the attribute setting. Then, use wrappers for that function in `lowerCall` and `setArgFlags`. (Note that the flag-setting code in `setArgFlags` was missing the returned attribute. There's no selection for this yet, so no test. It's an example of the kind of thing this lets us avoid, though.) Differential Revision: https://reviews.llvm.org/D86159 --- .../llvm/CodeGen/GlobalISel/CallLowering.h | 6 ++ llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 66 +++++++++---------- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h index 1eec08f5106220..ef93042f6690d8 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h @@ -213,6 +213,12 @@ class CallLowering { ISD::ArgFlagsTy getAttributesForArgIdx(const CallBase &Call, unsigned ArgIdx) const; + /// Adds flags to \p Flags based off of the attributes in \p Attrs. + /// \p OpIdx is the index in \p Attrs to add flags from. + void addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags, + const AttributeList &Attrs, + unsigned OpIdx) const; + template void setArgFlags(ArgInfo &Arg, unsigned OpIdx, const DataLayout &DL, const FuncInfoTy &FuncInfo) const; diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index cf1059c67b4a05..49d101a81e933e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -30,34 +30,51 @@ using namespace llvm; void CallLowering::anchor() {} -ISD::ArgFlagsTy CallLowering::getAttributesForArgIdx(const CallBase &Call, - unsigned ArgIdx) const { - ISD::ArgFlagsTy Flags; - if (Call.paramHasAttr(ArgIdx, Attribute::SExt)) +/// Helper function which updates \p Flags when \p AttrFn returns true. 
+static void +addFlagsUsingAttrFn(ISD::ArgFlagsTy &Flags, + const std::function<bool(Attribute::AttrKind)> &AttrFn) { + if (AttrFn(Attribute::SExt)) Flags.setSExt(); - if (Call.paramHasAttr(ArgIdx, Attribute::ZExt)) + if (AttrFn(Attribute::ZExt)) Flags.setZExt(); - if (Call.paramHasAttr(ArgIdx, Attribute::InReg)) + if (AttrFn(Attribute::InReg)) Flags.setInReg(); - if (Call.paramHasAttr(ArgIdx, Attribute::StructRet)) + if (AttrFn(Attribute::StructRet)) Flags.setSRet(); - if (Call.paramHasAttr(ArgIdx, Attribute::Nest)) + if (AttrFn(Attribute::Nest)) Flags.setNest(); - if (Call.paramHasAttr(ArgIdx, Attribute::ByVal)) + if (AttrFn(Attribute::ByVal)) Flags.setByVal(); - if (Call.paramHasAttr(ArgIdx, Attribute::Preallocated)) + if (AttrFn(Attribute::Preallocated)) Flags.setPreallocated(); - if (Call.paramHasAttr(ArgIdx, Attribute::InAlloca)) + if (AttrFn(Attribute::InAlloca)) Flags.setInAlloca(); - if (Call.paramHasAttr(ArgIdx, Attribute::Returned)) + if (AttrFn(Attribute::Returned)) Flags.setReturned(); - if (Call.paramHasAttr(ArgIdx, Attribute::SwiftSelf)) + if (AttrFn(Attribute::SwiftSelf)) Flags.setSwiftSelf(); - if (Call.paramHasAttr(ArgIdx, Attribute::SwiftError)) + if (AttrFn(Attribute::SwiftError)) Flags.setSwiftError(); +} + +ISD::ArgFlagsTy CallLowering::getAttributesForArgIdx(const CallBase &Call, unsigned ArgIdx) const { ISD::ArgFlagsTy Flags; + addFlagsUsingAttrFn(Flags, [&Call, &ArgIdx](Attribute::AttrKind Attr) { + return Call.paramHasAttr(ArgIdx, Attr); + }); return Flags; } +void CallLowering::addArgFlagsFromAttributes(ISD::ArgFlagsTy &Flags, + const AttributeList &Attrs, + unsigned OpIdx) const { + addFlagsUsingAttrFn(Flags, [&Attrs, &OpIdx](Attribute::AttrKind Attr) { + return Attrs.hasAttribute(OpIdx, Attr); + }); +} + bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB, ArrayRef<Register> ResRegs, ArrayRef<ArrayRef<Register>> ArgRegs, @@ -118,24 +135,7 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, const FuncInfoTy &FuncInfo) const { auto &Flags = Arg.Flags[0]; const AttributeList &Attrs = FuncInfo.getAttributes(); - if (Attrs.hasAttribute(OpIdx, Attribute::ZExt)) - Flags.setZExt(); - if (Attrs.hasAttribute(OpIdx, Attribute::SExt)) - Flags.setSExt(); - if (Attrs.hasAttribute(OpIdx, Attribute::InReg)) - Flags.setInReg(); - if (Attrs.hasAttribute(OpIdx, Attribute::StructRet)) - Flags.setSRet(); - if (Attrs.hasAttribute(OpIdx, Attribute::SwiftSelf)) - Flags.setSwiftSelf(); - if (Attrs.hasAttribute(OpIdx, Attribute::SwiftError)) - Flags.setSwiftError(); - if (Attrs.hasAttribute(OpIdx, Attribute::ByVal)) - Flags.setByVal(); - if (Attrs.hasAttribute(OpIdx, Attribute::Preallocated)) - Flags.setPreallocated(); - if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca)) - Flags.setInAlloca(); + addArgFlagsFromAttributes(Flags, Attrs, OpIdx); if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) { Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType(); @@ -152,8 +152,6 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx, FrameAlign = Align(getTLI()->getByValTypeAlignment(ElementTy, DL)); Flags.setByValAlign(FrameAlign); } - if (Attrs.hasAttribute(OpIdx, Attribute::Nest)) - Flags.setNest(); Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty)); } From 62dbbcf6d7c67b02fd540a5a1e55c494bf88adea Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Aug 2020 19:03:40 +0000 Subject: [PATCH 076/101] Remove MLIREDSCInterface library which isn't used anywhere (NFC) Reviewed By: nicolasvasilache, ftynse Differential Revision: https://reviews.llvm.org/D85042 ---
mlir/lib/EDSC/CMakeLists.txt | 11 ----- mlir/lib/EDSC/CoreAPIs.cpp | 93 ------------------------------------ 2 files changed, 104 deletions(-) delete mode 100644 mlir/lib/EDSC/CoreAPIs.cpp diff --git a/mlir/lib/EDSC/CMakeLists.txt b/mlir/lib/EDSC/CMakeLists.txt index 6d56f263f2713a..33e1d8e80e2fc5 100644 --- a/mlir/lib/EDSC/CMakeLists.txt +++ b/mlir/lib/EDSC/CMakeLists.txt @@ -14,14 +14,3 @@ add_mlir_library(MLIREDSC MLIRSupport ) -add_mlir_library(MLIREDSCInterface - CoreAPIs.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/EDSC - - LINK_LIBS PUBLIC - MLIRIR - MLIRSupport - MLIRParser - ) diff --git a/mlir/lib/EDSC/CoreAPIs.cpp b/mlir/lib/EDSC/CoreAPIs.cpp deleted file mode 100644 index 55b7c2c77a0efe..00000000000000 --- a/mlir/lib/EDSC/CoreAPIs.cpp +++ /dev/null @@ -1,93 +0,0 @@ -//===- Types.cpp - Implementations of MLIR Core C APIs --------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir-c/Core.h" - -#include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/Types.h" -#include "mlir/Support/LLVM.h" - -#include "mlir/Parser.h" - -#include "llvm/ADT/StringSwitch.h" - -using namespace mlir; - -mlir_type_t makeMemRefType(mlir_context_t context, mlir_type_t elemType, - int64_list_t sizes) { - auto t = mlir::MemRefType::get( - ArrayRef(sizes.values, sizes.n), - mlir::Type::getFromOpaquePointer(elemType), - {mlir::AffineMap::getMultiDimIdentityMap( - sizes.n, reinterpret_cast(context))}, - 0); - return mlir_type_t{t.getAsOpaquePointer()}; -} - -mlir_type_t makeFunctionType(mlir_context_t context, mlir_type_list_t inputs, - mlir_type_list_t outputs) { - SmallVector ins(inputs.n), outs(outputs.n); - for (unsigned i = 0; i < inputs.n; ++i) { - ins[i] = mlir::Type::getFromOpaquePointer(inputs.types[i]); - } - for (unsigned i = 0; i < outputs.n; ++i) { - outs[i] = mlir::Type::getFromOpaquePointer(outputs.types[i]); - } - auto ft = mlir::FunctionType::get( - ins, outs, reinterpret_cast(context)); - return mlir_type_t{ft.getAsOpaquePointer()}; -} - -mlir_type_t makeIndexType(mlir_context_t context) { - auto *ctx = reinterpret_cast(context); - auto type = mlir::IndexType::get(ctx); - return mlir_type_t{type.getAsOpaquePointer()}; -} - -mlir_attr_t makeIntegerAttr(mlir_type_t type, int64_t value) { - auto ty = Type::getFromOpaquePointer(reinterpret_cast(type)); - auto attr = IntegerAttr::get(ty, value); - return mlir_attr_t{attr.getAsOpaquePointer()}; -} - -mlir_attr_t makeBoolAttr(mlir_context_t context, bool value) { - auto *ctx = reinterpret_cast(context); - auto attr = BoolAttr::get(value, ctx); - return mlir_attr_t{attr.getAsOpaquePointer()}; -} - -mlir_attr_t makeFloatAttr(mlir_context_t context, float value) { - auto *ctx = reinterpret_cast(context); - auto attr = FloatAttr::get(FloatType::getF32(ctx), APFloat(value)); - return mlir_attr_t{attr.getAsOpaquePointer()}; -} - -mlir_attr_t makeStringAttr(mlir_context_t context, const char *value) { - auto *ctx = reinterpret_cast(context); - auto attr = StringAttr::get(value, ctx); - return mlir_attr_t{attr.getAsOpaquePointer()}; -} - -unsigned getFunctionArity(mlir_func_t function) { - auto f = 
mlir::FuncOp::getFromOpaquePointer(function); - return f.getNumArguments(); -} - -mlir_type_t mlirParseType(const char *type, mlir_context_t context, - uint64_t *charsRead) { - auto *ctx = reinterpret_cast<mlir::MLIRContext *>(context); - size_t numRead = 0; - Type ty = parseType(type, ctx, numRead); - if (charsRead) - *charsRead = numRead; - return mlir_type_t{ty.getAsOpaquePointer()}; -} From 6b1f9f2bd4437910804d571284b7c5bb66eac250 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 18 Aug 2020 12:29:58 -0700 Subject: [PATCH 077/101] [X86] Don't call SemaBuiltinConstantArg from CheckX86BuiltinTileDuplicate if Argument is Type or Value Dependent. SemaBuiltinConstantArg has an early exit for that case that doesn't produce an error and doesn't update the APInt. We need to detect that case and not use the APInt value. While there, delete the signature of CheckX86BuiltinTileArgumentsRange that takes a single argument index to check. There's another version that takes an ArrayRef, and a single value is convertible to an ArrayRef. --- clang/include/clang/Sema/Sema.h | 1 - clang/lib/Sema/SemaChecking.cpp | 17 +++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 1d12551a8ad212..19d58b889ef75e 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -12192,7 +12192,6 @@ class Sema final { bool CheckX86BuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall); bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, ArrayRef<int> ArgNums); - bool CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum); bool CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums); bool CheckX86BuiltinTileRangeAndDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 70d3a682fc7028..deceffdb0ba50c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3705,7 +3705,7 @@ bool Sema::CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, enum { TileRegLow = 0, TileRegHigh = 7 }; bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, - ArrayRef<int> ArgNums) { + ArrayRef<int> ArgNums) { for (int ArgNum : ArgNums) { if (SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh)) return true; @@ -3713,19 +3713,20 @@ bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, return false; } -bool Sema::CheckX86BuiltinTileArgumentsRange(CallExpr *TheCall, int ArgNum) { - return SemaBuiltinConstantArgRange(TheCall, ArgNum, TileRegLow, TileRegHigh); -} - bool Sema::CheckX86BuiltinTileDuplicate(CallExpr *TheCall, ArrayRef<int> ArgNums) { // Because the max number of tile registers is TileRegHigh + 1, we use // each bit to represent the usage of them in a bitset.
std::bitset<TileRegHigh + 1> ArgValues; for (int ArgNum : ArgNums) { - llvm::APSInt Arg; - SemaBuiltinConstantArg(TheCall, ArgNum, Arg); - int ArgExtValue = Arg.getExtValue(); + Expr *Arg = TheCall->getArg(ArgNum); + if (Arg->isTypeDependent() || Arg->isValueDependent()) + continue; + + llvm::APSInt Result; + if (SemaBuiltinConstantArg(TheCall, ArgNum, Result)) + return true; + int ArgExtValue = Result.getExtValue(); assert((ArgExtValue >= TileRegLow || ArgExtValue <= TileRegHigh) && "Incorrect tile register num."); if (ArgValues.test(ArgExtValue)) From 673dbe1b5eef09db39783c828a84f1213a47bad0 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 30 Jul 2020 17:32:39 -0700 Subject: [PATCH 078/101] [clang codegen] Use IR "align" attribute for static array arguments. Without the "align" attribute, marking the argument dereferenceable is basically useless. See also D80166. Fixes https://bugs.llvm.org/show_bug.cgi?id=46876 . Differential Revision: https://reviews.llvm.org/D84992 --- clang/lib/CodeGen/CGCall.cpp | 16 ++++++++++++---- clang/test/CodeGen/vla.c | 10 +++++----- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 9d225b23e3c335..98ba1efc20de6f 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2520,6 +2520,9 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, // bytes). if (ArrTy->getSizeModifier() == ArrayType::Static) { QualType ETy = ArrTy->getElementType(); + llvm::Align Alignment = + CGM.getNaturalTypeAlignment(ETy).getAsAlign(); + AI->addAttrs(llvm::AttrBuilder().addAlignmentAttr(Alignment)); uint64_t ArrSize = ArrTy->getSize().getZExtValue(); if (!ETy->isIncompleteType() && ETy->isConstantSizeType() && ArrSize) { @@ -2539,10 +2542,15 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI, // For C99 VLAs with the static keyword, we don't know the size so // we can't use the dereferenceable attribute, but in addrspace(0) // we know that it must be nonnull. - if (ArrTy->getSizeModifier() == VariableArrayType::Static && - !getContext().getTargetAddressSpace(ArrTy->getElementType()) && - !CGM.getCodeGenOpts().NullPointerIsValid) - AI->addAttr(llvm::Attribute::NonNull); + if (ArrTy->getSizeModifier() == VariableArrayType::Static) { + QualType ETy = ArrTy->getElementType(); + llvm::Align Alignment = + CGM.getNaturalTypeAlignment(ETy).getAsAlign(); + AI->addAttrs(llvm::AttrBuilder().addAlignmentAttr(Alignment)); + if (!getContext().getTargetAddressSpace(ETy) && + !CGM.getCodeGenOpts().NullPointerIsValid) + AI->addAttr(llvm::Attribute::NonNull); + } } // Set `align` attribute if any. diff --git a/clang/test/CodeGen/vla.c b/clang/test/CodeGen/vla.c index 16b82f4acc7d38..3142050149aaab 100644 --- a/clang/test/CodeGen/vla.c +++ b/clang/test/CodeGen/vla.c @@ -200,13 +200,13 @@ void test7(int a[b(0)]) { // Make sure we emit dereferenceable or nonnull when the static keyword is // provided. void test8(int a[static 3]) { } -// CHECK: define void @test8(i32* dereferenceable(12) %a) +// CHECK: define void @test8(i32* align 4 dereferenceable(12) %a) void test9(int n, int a[static n]) { } -// NULL-INVALID: define void @test9(i32 %n, i32* nonnull %a) -// NULL-VALID: define void @test9(i32 %n, i32* %a) +// NULL-INVALID: define void @test9(i32 %n, i32* nonnull align 4 %a) +// NULL-VALID: define void @test9(i32 %n, i32* align 4 %a) // Make sure a zero-sized static array extent is still required to be nonnull.
void test10(int a[static 0]) {} -// NULL-INVALID: define void @test10(i32* nonnull %a) -// NULL-VALID: define void @test10(i32* %a) +// NULL-INVALID: define void @test10(i32* nonnull align 4 %a) +// NULL-VALID: define void @test10(i32* align 4 %a) From bb18532399cf01c712e18f85ecb1cfb612d664ac Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 4 Aug 2020 14:57:16 -0700 Subject: [PATCH 079/101] [AArch64][SVE] Allow llvm.aarch64.sve.st2/3/4 with vectors of pointers. This isn't necessary for ACLE, but could be useful in other situations. And the change is simple. Differential Revision: https://reviews.llvm.org/D85251 --- .../Target/AArch64/AArch64ISelLowering.cpp | 15 ++++---- .../CodeGen/AArch64/sve-intrinsics-stores.ll | 38 +++++++++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2cd2f67171205f..b37947495a4329 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9462,16 +9462,17 @@ SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op, /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics. template <unsigned NumVecs> -static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info, - const CallInst &CI) { +static bool +setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, + AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) { Info.opc = ISD::INTRINSIC_VOID; // Retrieve EC from first vector argument. - const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType()); + const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType()); ElementCount EC = VT.getVectorElementCount(); #ifndef NDEBUG // Check the assumption that all input vectors are the same type. for (unsigned I = 0; I < NumVecs; ++I) - assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) && + assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) && "Invalid type."); #endif // memVT is `NumVecs * VT`.
@@ -9494,11 +9495,11 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_sve_st2: - return setInfoSVEStN<2>(Info, I); + return setInfoSVEStN<2>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st3: - return setInfoSVEStN<3>(Info, I); + return setInfoSVEStN<3>(*this, DL, Info, I); case Intrinsic::aarch64_sve_st4: - return setInfoSVEStN<4>(Info, I); + return setInfoSVEStN<4>(*this, DL, Info, I); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll index 92877233b2c9c1..f3fcddbaa2fcf0 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll @@ -108,6 +108,17 @@ define void @st2d_f64( %v0, %v1, %v0, %v1, %pred, i8** %addr) { +; CHECK-LABEL: st2d_ptr: +; CHECK: st2d { z0.d, z1.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st2.nxv2p0i8( %v0, + %v1, + %pred, + i8** %addr) + ret void +} + ; ; ST3B ; @@ -220,6 +231,18 @@ define void @st3d_f64( %v0, %v1, %v0, %v1, %v2, %pred, i8** %addr) { +; CHECK-LABEL: st3d_ptr: +; CHECK: st3d { z0.d, z1.d, z2.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st3.nxv2p0i8( %v0, + %v1, + %v2, + %pred, + i8** %addr) + ret void +} + ; ; ST4B ; @@ -340,6 +363,18 @@ define void @st4d_f64( %v0, %v1, %v0, %v1, %v2, %v3, %pred, i8** %addr) { +; CHECK-LABEL: st4d_ptr: +; CHECK: st4d { z0.d, z1.d, z2.d, z3.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st4.nxv2p0i8( %v0, + %v1, + %v2, + %v3, + %pred, + i8** %addr) + ret void +} ; ; STNT1B ; @@ -508,6 +543,7 @@ declare void @llvm.aarch64.sve.st2.nxv8f16(, , , , bfloat*) declare void @llvm.aarch64.sve.st2.nxv4f32(, , , float*) declare void @llvm.aarch64.sve.st2.nxv2f64(, , , double*) +declare void @llvm.aarch64.sve.st2.nxv2p0i8(, , , i8** nocapture) declare void @llvm.aarch64.sve.st3.nxv16i8(, , , , i8*) declare void @llvm.aarch64.sve.st3.nxv8i16(, , , , i16*) @@ -517,6 +553,7 @@ declare void @llvm.aarch64.sve.st3.nxv8f16(, , , , , bfloat*) declare void @llvm.aarch64.sve.st3.nxv4f32(, , , , float*) declare void @llvm.aarch64.sve.st3.nxv2f64(, , , , double*) +declare void @llvm.aarch64.sve.st3.nxv2p0i8(, , , , i8** nocapture) declare void @llvm.aarch64.sve.st4.nxv16i8(, , , , , i8*) declare void @llvm.aarch64.sve.st4.nxv8i16(, , , , , i16*) @@ -526,6 +563,7 @@ declare void @llvm.aarch64.sve.st4.nxv8f16(, , , , , , bfloat*) declare void @llvm.aarch64.sve.st4.nxv4f32(, , , , , float*) declare void @llvm.aarch64.sve.st4.nxv2f64(, , , , , double*) +declare void @llvm.aarch64.sve.st4.nxv2p0i8(, , , , , i8** nocapture) declare void @llvm.aarch64.sve.stnt1.nxv16i8(, , i8*) declare void @llvm.aarch64.sve.stnt1.nxv8i16(, , i16*) From be944c85f375c0faa36ee5c7ccbc79ff9a78a0d5 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Mon, 10 Aug 2020 12:23:03 -0700 Subject: [PATCH 080/101] [AArch64][SVE] Add patterns for integer mla/mls. We probably want to introduce pseudo-instructions at some point, like we have for binary operations, but this seems okay for now. One thing I'm not sure about is whether we should be doing this as a DAGCombine instead of directly pattern-matching it. I don't see any big downside to doing it this way, though. 
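As a rough sketch of what the new patterns catch (illustrative IR, not one of the committed tests; the function name is made up): a single-use multiply feeding an add or sub on scalable vectors should now select to one predicated multiply-accumulate instead of a separate mul and add/sub:

define <vscale x 4 x i32> @mla_sketch(<vscale x 4 x i32> %acc,
                                      <vscale x 4 x i32> %a,
                                      <vscale x 4 x i32> %b) {
  ; With these patterns this is expected to codegen to a single
  ;   mla z0.s, p0/m, z1.s, z2.s
  ; (z0 += z1 * z2 under an all-true predicate) rather than mul + add.
  %mul = mul <vscale x 4 x i32> %a, %b
  %add = add <vscale x 4 x i32> %acc, %mul
  ret <vscale x 4 x i32> %add
}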
Differential Revision: https://reviews.llvm.org/D85681 --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 12 +- .../CodeGen/AArch64/llvm-ir-to-intrinsic.ll | 24 +- llvm/test/CodeGen/AArch64/sve-gep.ll | 17 +- llvm/test/CodeGen/AArch64/sve-int-arith.ll | 207 ++++++++++++------ 5 files changed, 170 insertions(+), 99 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index ea4c6cab5c35d3..03c0bebd44b135 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -233,6 +233,11 @@ def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs), def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs), [(setone node:$lhs, node:$rhs), (setne node:$lhs, node:$rhs)]>; +def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2), + (AArch64mul_p node:$pred, node:$src1, node:$src2), [{ + return N->hasOneUse(); +}]>; + let Predicates = [HasSVE] in { defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; @@ -281,8 +286,8 @@ let Predicates = [HasSVE] in { defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>; + defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla, add, AArch64mul_p_oneuse>; + defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls, sub, AArch64mul_p_oneuse>; // SVE predicated integer reductions. defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 5eb811b9c78eb5..65b0a8623b7d29 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -2518,7 +2518,8 @@ class sve_int_mlas_vvv_pred sz8_64, bits<1> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op> { +multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op, + SDPatternOperator outerop, SDPatternOperator mulop> { def _B : sve_int_mlas_vvv_pred<0b00, opc, asm, ZPR8>; def _H : sve_int_mlas_vvv_pred<0b01, opc, asm, ZPR16>; def _S : sve_int_mlas_vvv_pred<0b10, opc, asm, ZPR32>; @@ -2528,6 +2529,15 @@ multiclass sve_int_mlas_vvv_pred opc, string asm, SDPatternOperator op> def : SVE_4_Op_Pat(NAME # _H)>; def : SVE_4_Op_Pat(NAME # _S)>; def : SVE_4_Op_Pat(NAME # _D)>; + + def : Pat<(outerop nxv16i8:$Op1, (mulop nxv16i1:$pred, nxv16i8:$Op2, nxv16i8:$Op3)), + (!cast(NAME # _B) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv8i16:$Op1, (mulop nxv8i1:$pred, nxv8i16:$Op2, nxv8i16:$Op3)), + (!cast(NAME # _H) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv4i32:$Op1, (mulop nxv4i1:$pred, nxv4i32:$Op2, nxv4i32:$Op3)), + (!cast(NAME # _S) $pred, $Op1, $Op2, $Op3)>; + def : Pat<(outerop nxv2i64:$Op1, (mulop nxv2i1:$pred, nxv2i64:$Op2, nxv2i64:$Op3)), + (!cast(NAME # _D) $pred, $Op1, $Op2, $Op3)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index d43dcda36231aa..bcff9e056f18c9 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ 
b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -131,8 +131,7 @@ define @srem_i8( %a, %b) ; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h ; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b -; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: ret %div = srem %a, %b ret %div @@ -151,8 +150,7 @@ define @srem_i16( %a, %b ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h -; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret %div = srem %a, %b ret %div @@ -164,8 +162,7 @@ define @srem_i32( %a, %b ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: ret %div = srem %a, %b ret %div @@ -177,8 +174,7 @@ define @srem_i64( %a, %b ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: ret %div = srem %a, %b ret %div @@ -315,8 +311,7 @@ define @urem_i8( %a, %b) ; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h ; CHECK-NEXT: uzp1 z2.b, z3.b, z2.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mul z1.b, p0/m, z1.b, z2.b -; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: mls z0.b, p0/m, z2.b, z1.b ; CHECK-NEXT: ret %div = urem %a, %b ret %div @@ -335,8 +330,7 @@ define @urem_i16( %a, %b ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mul z1.h, p0/m, z1.h, z2.h -; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret %div = urem %a, %b ret %div @@ -348,8 +342,7 @@ define @urem_i32( %a, %b ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.s, p0/m, z2.s, z1.s -; CHECK-NEXT: mul z1.s, p0/m, z1.s, z2.s -; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: mls z0.s, p0/m, z2.s, z1.s ; CHECK-NEXT: ret %div = urem %a, %b ret %div @@ -361,8 +354,7 @@ define @urem_i64( %a, %b ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d -; CHECK-NEXT: mul z1.d, p0/m, z1.d, z2.d -; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: mls z0.d, p0/m, z2.d, z1.d ; CHECK-NEXT: ret %div = urem %a, %b ret %div diff --git a/llvm/test/CodeGen/AArch64/sve-gep.ll b/llvm/test/CodeGen/AArch64/sve-gep.ll index 4230a7fa287160..8f68a38e2cd20d 100644 --- a/llvm/test/CodeGen/AArch64/sve-gep.ll +++ b/llvm/test/CodeGen/AArch64/sve-gep.ll @@ -105,10 +105,11 @@ define *> @scalable_of_scalable_1( insertelement ( undef, i64 1, i32 0), zeroinitializer, zeroinitializer %d = getelementptr , * %base, %idx @@ -119,9 +120,10 @@ define *> @scalable_of_scalable_2( insertelement ( undef, i64 1, i32 0), zeroinitializer, zeroinitializer %d = getelementptr , *> %base, %idx @@ -135,8 +137,7 @@ define *> @scalable_of_scalable_3(, *> %base, %idx ret *> %d diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll index d70e817085500f..bcd94d2d019334 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | 
FileCheck %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t @@ -5,200 +6,262 @@ ; WARN-NOT: warning define @add_i64( %a, %b) { -; CHECK-LABEL: add_i64 -; CHECK: add z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: add_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = add %a, %b ret %res } define @add_i32( %a, %b) { -; CHECK-LABEL: add_i32 -; CHECK: add z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: add_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = add %a, %b ret %res } define @add_i16( %a, %b) { -; CHECK-LABEL: add_i16 -; CHECK: add z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: add_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = add %a, %b ret %res } define @add_i8( %a, %b) { -; CHECK-LABEL: add_i8 -; CHECK: add z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: add_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: add z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = add %a, %b ret %res } define @sub_i64( %a, %b) { -; CHECK-LABEL: sub_i64 -; CHECK: sub z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = sub %a, %b ret %res } define @sub_i32( %a, %b) { -; CHECK-LABEL: sub_i32 -; CHECK: sub z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = sub %a, %b ret %res } define @sub_i16( %a, %b) { -; CHECK-LABEL: sub_i16 -; CHECK: sub z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = sub %a, %b ret %res } define @sub_i8( %a, %b) { -; CHECK-LABEL: sub_i8 -; CHECK: sub z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: sub_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sub z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = sub %a, %b ret %res } define @sqadd_i64( %a, %b) { -; CHECK-LABEL: sqadd_i64 -; CHECK: sqadd z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = call @llvm.sadd.sat.nxv2i64( %a, %b) ret %res } define @sqadd_i32( %a, %b) { -; CHECK-LABEL: sqadd_i32 -; CHECK: sqadd z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = call @llvm.sadd.sat.nxv4i32( %a, %b) ret %res } define @sqadd_i16( %a, %b) { -; CHECK-LABEL: sqadd_i16 -; CHECK: sqadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = call @llvm.sadd.sat.nxv8i16( %a, %b) ret %res } define @sqadd_i8( %a, %b) { -; CHECK-LABEL: sqadd_i8 -; CHECK: sqadd z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: sqadd_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sqadd z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = call @llvm.sadd.sat.nxv16i8( %a, %b) ret %res } define @sqsub_i64( %a, %b) { -; CHECK-LABEL: sqsub_i64 -; CHECK: sqsub z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = call @llvm.ssub.sat.nxv2i64( %a, %b) ret %res } define @sqsub_i32( %a, %b) { -; CHECK-LABEL: sqsub_i32 -; CHECK: sqsub z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = call @llvm.ssub.sat.nxv4i32( %a, %b) ret %res } define 
@sqsub_i16( %a, %b) { -; CHECK-LABEL: sqsub_i16 -; CHECK: sqsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = call @llvm.ssub.sat.nxv8i16( %a, %b) ret %res } define @sqsub_i8( %a, %b) { -; CHECK-LABEL: sqsub_i8 -; CHECK: sqsub z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: sqsub_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: sqsub z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = call @llvm.ssub.sat.nxv16i8( %a, %b) ret %res } define @uqadd_i64( %a, %b) { -; CHECK-LABEL: uqadd_i64 -; CHECK: uqadd z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = call @llvm.uadd.sat.nxv2i64( %a, %b) ret %res } define @uqadd_i32( %a, %b) { -; CHECK-LABEL: uqadd_i32 -; CHECK: uqadd z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = call @llvm.uadd.sat.nxv4i32( %a, %b) ret %res } define @uqadd_i16( %a, %b) { -; CHECK-LABEL: uqadd_i16 -; CHECK: uqadd z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = call @llvm.uadd.sat.nxv8i16( %a, %b) ret %res } define @uqadd_i8( %a, %b) { -; CHECK-LABEL: uqadd_i8 -; CHECK: uqadd z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: uqadd_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uqadd z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = call @llvm.uadd.sat.nxv16i8( %a, %b) ret %res } define @uqsub_i64( %a, %b) { -; CHECK-LABEL: uqsub_i64 -; CHECK: uqsub z0.d, z0.d, z1.d -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.d, z0.d, z1.d +; CHECK-NEXT: ret %res = call @llvm.usub.sat.nxv2i64( %a, %b) ret %res } define @uqsub_i32( %a, %b) { -; CHECK-LABEL: uqsub_i32 -; CHECK: uqsub z0.s, z0.s, z1.s -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.s, z0.s, z1.s +; CHECK-NEXT: ret %res = call @llvm.usub.sat.nxv4i32( %a, %b) ret %res } define @uqsub_i16( %a, %b) { -; CHECK-LABEL: uqsub_i16 -; CHECK: uqsub z0.h, z0.h, z1.h -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.h, z0.h, z1.h +; CHECK-NEXT: ret %res = call @llvm.usub.sat.nxv8i16( %a, %b) ret %res } define @uqsub_i8( %a, %b) { -; CHECK-LABEL: uqsub_i8 -; CHECK: uqsub z0.b, z0.b, z1.b -; CHECK-NEXT: ret +; CHECK-LABEL: uqsub_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uqsub z0.b, z0.b, z1.b +; CHECK-NEXT: ret %res = call @llvm.usub.sat.nxv16i8( %a, %b) ret %res } +define @mla_i8( %a, %b, %c) { +; CHECK-LABEL: mla_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mla z2.b, p0/m, z0.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %prod = mul %a, %b + %res = add %c, %prod + ret %res +} + +define @mla_i8_multiuse( %a, %b, %c, * %p) { +; CHECK-LABEL: mla_i8_multiuse: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b +; CHECK-NEXT: add z0.b, z2.b, z1.b +; CHECK-NEXT: st1b { z1.b }, p0, [x0] +; CHECK-NEXT: ret + %prod = mul %a, %b + store %prod, * %p + %res = add %c, %prod + ret %res +} + +define @mls_i8( %a, %b, %c) { +; CHECK-LABEL: mls_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mls z2.b, p0/m, z0.b, z1.b +; CHECK-NEXT: mov z0.d, z2.d +; CHECK-NEXT: ret + %prod = mul %a, %b + %res = sub %c, %prod + ret %res +} + declare @llvm.sadd.sat.nxv16i8(, ) declare 
<vscale x 8 x i16> @llvm.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

From 45901ebd430566a1b4ed7ae954a1d6d1722f3276 Mon Sep 17 00:00:00 2001
From: Marius Brehler
Date: Tue, 18 Aug 2020 22:16:00 +0200
Subject: [PATCH 081/101] [mlir] Check libraries linked into standalone-opt

Adds a call to mlir_check_all_link_libraries() to check all libraries
linked into standalone-opt.
---
 mlir/examples/standalone/standalone-opt/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mlir/examples/standalone/standalone-opt/CMakeLists.txt b/mlir/examples/standalone/standalone-opt/CMakeLists.txt
index 854fd556ae868c..06bbb4712645a3 100644
--- a/mlir/examples/standalone/standalone-opt/CMakeLists.txt
+++ b/mlir/examples/standalone/standalone-opt/CMakeLists.txt
@@ -10,3 +10,5 @@ add_llvm_executable(standalone-opt standalone-opt.cpp)
 llvm_update_compile_flags(standalone-opt)
 target_link_libraries(standalone-opt PRIVATE ${LIBS})
+
+mlir_check_all_link_libraries(standalone-opt)

From 0b98a59fedb5e98661ca531d5ed20110bfdd7b2f Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Tue, 18 Aug 2020 13:44:29 -0400
Subject: [PATCH 082/101] [VectorCombine] add tests for vector loads; NFC

---
 .../test/Transforms/VectorCombine/X86/load.ll | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 524f48332b7cb8..104c8c2d025faf 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -373,3 +373,29 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
   %r = insertelement <8 x i32> undef, i32 %s, i32 0
   ret <8 x i32> %r
 }
+
+; TODO: Should load v4f32.
+
+define <8 x float> @load_f32_insert_v8f32(float* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @load_f32_insert_v8f32(
+; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = insertelement <8 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    ret <8 x float> [[R]]
+;
+  %s = load float, float* %p, align 4
+  %r = insertelement <8 x float> undef, float %s, i32 0
+  ret <8 x float> %r
+}
+
+; TODO: Should load v4f32.
+
+define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
+; CHECK-LABEL: @load_f32_insert_v2f32(
+; CHECK-NEXT:    [[S:%.*]] = load float, float* [[P:%.*]], align 4
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT:    ret <2 x float> [[R]]
+;
+  %s = load float, float* %p, align 4
+  %r = insertelement <2 x float> undef, float %s, i32 0
+  ret <2 x float> %r
+}

From 08748d15b8d696db9f894db38d74a212b8ab43e6 Mon Sep 17 00:00:00 2001
From: Greg Clayton
Date: Mon, 17 Aug 2020 17:26:50 -0700
Subject: [PATCH 083/101] Fix a check that was attempting to see if an object
 file was in memory.

Checking if an object file is in memory should use
ObjectFile::IsInMemory(), not test ObjectFile::BaseAddress().
ObjectFile::BaseAddress() is designed to be overridden by all classes and
is for mach-o, ELF and COFF plug-ins. They find the header base address
and return that as a section offset address. The default implementation of
ObjectFile::BaseAddress() does try to make an Address() from
ObjectFile::m_memory_addr, but I switched it to the correct function call.
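
(For illustration only: a minimal model of why the old predicate misfires.
`FakeObjectFile` and its members are hypothetical stand-ins, not LLDB's real
ObjectFile API; they only contrast the two queries being compared.)

  #include <cassert>

  struct FakeObjectFile {
    bool FromMemory = false;   // set when the object was read from a memory image
    bool HasHeaderAddr = true; // mach-o/ELF/COFF plug-ins usually find one
    // Stand-in for GetBaseAddress().IsValid(): also true for file-backed objects.
    bool BaseAddressIsValid() const { return HasHeaderAddr || FromMemory; }
    bool IsInMemory() const { return FromMemory; }
  };

  int main() {
    FakeObjectFile OnDiskDsym; // file-backed, never read from memory
    // The old check treated any object with a valid base address as in-memory.
    assert(OnDiskDsym.BaseAddressIsValid() && !OnDiskDsym.IsInMemory());
  }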
Differential Revision: https://reviews.llvm.org/D86122
---
 lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index e7701c350ff518..babe5a3847274b 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -1628,7 +1628,7 @@ void ObjectFileMachO::ProcessSegmentCommand(const load_command &load_cmd_,
   } else if (unified_section_sp) {
     if (is_dsym && unified_section_sp->GetFileAddress() != load_cmd.vmaddr) {
       // Check to see if the module was read from memory?
-      if (module_sp->GetObjectFile()->GetBaseAddress().IsValid()) {
+      if (module_sp->GetObjectFile()->IsInMemory()) {
         // We have a module that is in memory and needs to have its file
         // address adjusted. We need to do this because when we load a file
         // from memory, its addresses will be slid already, yet the addresses

From 84fffa67283139954b7764328966b5f766db1003 Mon Sep 17 00:00:00 2001
From: Zequan Wu
Date: Mon, 17 Aug 2020 15:25:08 -0700
Subject: [PATCH 084/101] [Coverage] Adjust skipped regions only if
 {Prev,Next}TokLoc is in the same file as regions' {start, end}Loc

Fix a bug where {Prev, Next}TokLoc is in a different file from the skipped
regions' {start, end}Loc.

Differential Revision: https://reviews.llvm.org/D86116
---
 clang/lib/CodeGen/CoverageMappingGen.cpp    | 24 ++++++++++++---------
 clang/test/CoverageMapping/Inputs/comment.h |  6 ++++++
 clang/test/CoverageMapping/comment.cpp      | 13 +++++++++++
 3 files changed, 33 insertions(+), 10 deletions(-)
 create mode 100644 clang/test/CoverageMapping/Inputs/comment.h
 create mode 100644 clang/test/CoverageMapping/comment.cpp

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index e6e1b211193591..8277804d27c0ec 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -44,7 +44,8 @@ CoverageMappingModuleGen::setUpCoverageCallbacks(Preprocessor &PP) {
   PP.setTokenWatcher([CoverageInfo](clang::Token Tok) {
     // Update previous token location.
     CoverageInfo->PrevTokLoc = Tok.getLocation();
-    CoverageInfo->updateNextTokLoc(Tok.getLocation());
+    if (Tok.getKind() != clang::tok::eod)
+      CoverageInfo->updateNextTokLoc(Tok.getLocation());
   });
   return CoverageInfo;
 }
@@ -305,20 +306,24 @@ class CoverageMappingBuilder {
   /// non-comment token. If shrinking the skipped range would make it empty,
   /// this returns None.
   Optional<SpellingRegion> adjustSkippedRange(SourceManager &SM,
-                                              SpellingRegion SR,
+                                              SourceLocation LocStart,
+                                              SourceLocation LocEnd,
                                               SourceLocation PrevTokLoc,
                                               SourceLocation NextTokLoc) {
+    SpellingRegion SR{SM, LocStart, LocEnd};
     // If Range begin location is invalid, it's not a comment region.
if (PrevTokLoc.isInvalid()) return SR; unsigned PrevTokLine = SM.getSpellingLineNumber(PrevTokLoc); unsigned NextTokLine = SM.getSpellingLineNumber(NextTokLoc); SpellingRegion newSR(SR); - if (SR.LineStart == PrevTokLine) { + if (SM.isWrittenInSameFile(LocStart, PrevTokLoc) && + SR.LineStart == PrevTokLine) { newSR.LineStart = SR.LineStart + 1; newSR.ColumnStart = 1; } - if (SR.LineEnd == NextTokLine) { + if (SM.isWrittenInSameFile(LocEnd, NextTokLoc) && + SR.LineEnd == NextTokLine) { newSR.LineEnd = SR.LineEnd - 1; newSR.ColumnEnd = SR.ColumnStart + 1; } @@ -354,14 +359,13 @@ class CoverageMappingBuilder { auto CovFileID = getCoverageFileID(LocStart); if (!CovFileID) continue; - SpellingRegion SR{SM, LocStart, LocEnd}; - if (Optional res = - adjustSkippedRange(SM, SR, I.PrevTokLoc, I.NextTokLoc)) - SR = res.getValue(); - else + Optional SR = + adjustSkippedRange(SM, LocStart, LocEnd, I.PrevTokLoc, I.NextTokLoc); + if (!SR.hasValue()) continue; auto Region = CounterMappingRegion::makeSkipped( - *CovFileID, SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd); + *CovFileID, SR->LineStart, SR->ColumnStart, SR->LineEnd, + SR->ColumnEnd); // Make sure that we only collect the regions that are inside // the source code of this function. if (Region.LineStart >= FileLineRanges[*CovFileID].first && diff --git a/clang/test/CoverageMapping/Inputs/comment.h b/clang/test/CoverageMapping/Inputs/comment.h new file mode 100644 index 00000000000000..eec5833c2bd0bc --- /dev/null +++ b/clang/test/CoverageMapping/Inputs/comment.h @@ -0,0 +1,6 @@ + + + + + +x = 0; diff --git a/clang/test/CoverageMapping/comment.cpp b/clang/test/CoverageMapping/comment.cpp new file mode 100644 index 00000000000000..f8e4b4912e182f --- /dev/null +++ b/clang/test/CoverageMapping/comment.cpp @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s + +int f() { + int x = 0; +#include "Inputs/comment.h" /* + */ + return x; +} + +// CHECK: File 0, 3:9 -> 8:2 = #0 +// CHECK-NEXT: Expansion,File 0, 5:10 -> 5:28 = #0 +// CHECK-NEXT: Skipped,File 0, 6:1 -> 6:7 = 0 +// CHECK-NEXT: File 1, 1:1 -> 7:1 = #0 From 1870b52f0c0880ad9e40eb01344372c59dbc0fb1 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Tue, 18 Aug 2020 11:06:16 -0700 Subject: [PATCH 085/101] Recommit "PR44685: DebugInfo: Handle address-use-invalid type units referencing non-type units" Originally committed as be3ef93bf58aa5546c7baadfb21d43b75fbb4e24. Reverted by b4bffdbadfcceb3959aaf231c1542301944e5812 due to bot failures: http://green.lab.llvm.org/green/job/clang-stage1-cmake-RA-expensive/17380/testReport/junit/LLVM/DebugInfo_X86/addr_tu_to_non_tu_ll/ http://45.33.8.238/win/22216/step_11.txt MacOS failure due to testing Split DWARF which isn't compatible with MachO. Windows failure due to testing type units which aren't enabled on Windows. Fix both of these by applying an explicit x86 linux triple to the test. 
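
(A side note on the mechanics, as a deliberately simplified sketch: the fix
below extends AddressPool::resetUsedFlag() so that NonTypeUnitContext can
save and restore the pool's "used" flag RAII-style. `Pool` and
`ScopedUsedFlag` here are hypothetical stand-ins, not the in-tree classes.)

  #include <cassert>

  struct Pool {
    bool HasBeenUsed = false;
    bool hasBeenUsed() const { return HasBeenUsed; }
    void resetUsedFlag(bool Used = false) { HasBeenUsed = Used; }
  };

  // Mirrors the NonTypeUnitContext idiom: stash the outer unit's flag on
  // entry, give the nested unit a clean flag, restore the outer flag on exit.
  struct ScopedUsedFlag {
    Pool &P;
    bool Saved;
    ScopedUsedFlag(Pool &Pl) : P(Pl), Saved(Pl.hasBeenUsed()) { P.resetUsedFlag(); }
    ~ScopedUsedFlag() { P.resetUsedFlag(Saved); }
  };

  int main() {
    Pool P;
    P.resetUsedFlag(true);      // the outer unit has used the pool
    {
      ScopedUsedFlag Scope(P);
      assert(!P.hasBeenUsed()); // nested unit starts with a clean flag
    }
    assert(P.hasBeenUsed());    // outer state restored on exit
  }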
--- llvm/lib/CodeGen/AsmPrinter/AddressPool.h | 2 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 6 +- llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h | 1 + llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll | 89 ++++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.h b/llvm/lib/CodeGen/AsmPrinter/AddressPool.h index f92cf72093ca03..f1edc6c330d51e 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.h +++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.h @@ -48,7 +48,7 @@ class AddressPool { bool hasBeenUsed() const { return HasBeenUsed; } - void resetUsedFlag() { HasBeenUsed = false; } + void resetUsedFlag(bool HasBeenUsed = false) { this->HasBeenUsed = HasBeenUsed; } MCSymbol *getLabel() { return AddressTableBaseSym; } void setLabel(MCSymbol *Sym) { AddressTableBaseSym = Sym; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index f70eed32f0b532..cee72120accb79 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -3305,14 +3305,14 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU, DwarfDebug::NonTypeUnitContext::NonTypeUnitContext(DwarfDebug *DD) : DD(DD), - TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)) { + TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), AddrPoolUsed(DD->AddrPool.hasBeenUsed()) { DD->TypeUnitsUnderConstruction.clear(); - assert(TypeUnitsUnderConstruction.empty() || !DD->AddrPool.hasBeenUsed()); + DD->AddrPool.resetUsedFlag(); } DwarfDebug::NonTypeUnitContext::~NonTypeUnitContext() { DD->TypeUnitsUnderConstruction = std::move(TypeUnitsUnderConstruction); - DD->AddrPool.resetUsedFlag(); + DD->AddrPool.resetUsedFlag(AddrPoolUsed); } DwarfDebug::NonTypeUnitContext DwarfDebug::enterNonTypeUnitContext() { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 0b943ebe46b669..93e08d1151ff70 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -648,6 +648,7 @@ class DwarfDebug : public DebugHandlerBase { class NonTypeUnitContext { DwarfDebug *DD; decltype(DwarfDebug::TypeUnitsUnderConstruction) TypeUnitsUnderConstruction; + bool AddrPoolUsed; friend class DwarfDebug; NonTypeUnitContext(DwarfDebug *DD); public: diff --git a/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll new file mode 100644 index 00000000000000..e81cb38c2131b0 --- /dev/null +++ b/llvm/test/DebugInfo/X86/addr-tu-to-non-tu.ll @@ -0,0 +1,89 @@ +; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu -split-dwarf-file=x.dwo < %s \ +; RUN: | llvm-dwarfdump -debug-info -debug-types - \ +; RUN: | FileCheck --implicit-check-not=Unit --implicit-check-not=contents --implicit-check-not=declaration %s + +; Test that an address-using-with-Split-DWARF type unit that references a +; non-type unit is handled correctly. A NonTypeUnitContext is used to insulate +; the type construction from being discarded when the prior/outer type has to be +; discarded due to finding it used an address & so can't be type united under +; Split DWARF. + +; The intermediate types tu and t2 are here just to test a bit more +; thoroughly/broadly. They also demonstrate one slight limitation/sub-optimality +; since 't2' isn't put in a type unit. 
+ + +; extern int foo; +; namespace { +; struct t1 { +; }; +; } +; template struct t2 { +; t1 v1; +; }; +; struct t3 { +; t2<&foo> v1; +; }; +; t3 v1; + +; CHECK: .debug_info contents: +; CHECK: Compile Unit: + +; CHECK: .debug_info.dwo contents: +; CHECK: Compile Unit: + +; FIXME: In theory "t3" could be in a type unit - but at the moment, because it +; references t2, which needs an address, t3 gets non-type-united. +; But the same doesn't happen if t3 referenced an anonymous namespace type. + +; CHECK: DW_TAG_structure_type +; CHECK: DW_AT_name ("t3") +; CHECK: DW_TAG_member +; CHECK: DW_AT_type {{.*}} "t2<&foo>" +; CHECK: DW_TAG_namespace +; CHECK: [[T1:0x[0-9a-f]*]]: DW_TAG_structure_type +; CHECK: DW_AT_name ("t1") +; CHECK: DW_TAG_structure_type +; CHECK: DW_AT_name ("t2<&foo>") +; CHECK: DW_TAG_member +; CHECK: DW_AT_name ("v1") +; CHECK: DW_AT_type ([[T1]] "t1") + +; CHECK: .debug_types contents: + +; CHECK-NOT: .debug_types.dwo contents: + + +%struct.t3 = type { %struct.t2 } +%struct.t2 = type { %"struct.(anonymous namespace)::t1" } +%"struct.(anonymous namespace)::t1" = type { i8 } + +@v1 = dso_local global %struct.t3 zeroinitializer, align 1, !dbg !0 +@foo = external dso_local global i32, align 4 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!18, !19, !20} +!llvm.ident = !{!21} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "v1", scope: !2, file: !3, line: 16, type: !6, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 12.0.0 (git@github.com:llvm/llvm-project.git be646ae2865371c7a4966797e88f355de5653e04)", isOptimized: false, runtimeVersion: 0, splitDebugFilename: "test.dwo", emissionKind: FullDebug, enums: !4, globals: !5, splitDebugInlining: false, nameTableKind: GNU) +!3 = !DIFile(filename: "test.cpp", directory: "/usr/local/google/home/blaikie/dev/scratch") +!4 = !{} +!5 = !{!0} +!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t3", file: !3, line: 12, size: 8, flags: DIFlagTypePassByValue, elements: !7, identifier: "_ZTS2t3") +!7 = !{!8} +!8 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !6, file: !3, line: 13, baseType: !9, size: 8) +!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t2<&foo>", file: !3, line: 8, size: 8, flags: DIFlagTypePassByValue, elements: !10, templateParams: !14, identifier: "_ZTS2t2IXadL_Z3fooEEE") +!10 = !{!11} +!11 = !DIDerivedType(tag: DW_TAG_member, name: "v1", scope: !9, file: !3, line: 9, baseType: !12, size: 8) +!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", scope: !13, file: !3, line: 4, size: 8, flags: DIFlagTypePassByValue, elements: !4) +!13 = !DINamespace(scope: null) +!14 = !{!15} +!15 = !DITemplateValueParameter(type: !16, value: i32* @foo) +!16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!17 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!18 = !{i32 7, !"Dwarf Version", i32 4} +!19 = !{i32 2, !"Debug Info Version", i32 3} +!20 = !{i32 1, !"wchar_size", i32 4} +!21 = !{!"clang version 12.0.0 (git@github.com:llvm/llvm-project.git be646ae2865371c7a4966797e88f355de5653e04)"} From 5ccac05d433cf8a46683acb5293fb43280d0f2ed Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 18 Aug 2020 13:26:29 -0700 Subject: [PATCH 086/101] [mlir][Linalg] Modify callback for getting id/nprocs in LinalgDistribution options to allow more general distributions. 
Changing the signature of the callback to send in the ranges for all the parallel loops and expect a vector with the Value to use for the processor-id and number-of-processors for each of the parallel loops. Differential Revision: https://reviews.llvm.org/D86095 --- .../include/mlir/Dialect/Linalg/Utils/Utils.h | 14 +- mlir/lib/Dialect/Linalg/Utils/Utils.cpp | 24 +++- .../Dialect/Linalg/tile-and-distribute.mlir | 136 +++++++++--------- .../lib/Transforms/TestLinalgTransforms.cpp | 21 ++- 4 files changed, 103 insertions(+), 92 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 794ebcbc264516..beef1a70096e67 100644 --- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -198,19 +198,23 @@ enum class DistributionMethod { }; /// Callback function type used to get processor ID, and number of processors -/// used for distribution. +/// used for distribution for all parallel loops generated. struct ProcInfo { Value procId; Value nprocs; }; -using ProcInfoCallBackFn = - std::function; +using ProcInfoCallBackFn = std::function( + OpBuilder &b, Location loc, ArrayRef parallelLoopRanges)>; /// Options that allow distribution of loops generated in Linalg transforms to /// processors while generating the loops. struct LinalgLoopDistributionOptions { - /// Callback function that returns the Value for processor ID, and number of - /// processors used to execute a given loop. + /// Callback function that returns the Values for processor ID (`procId`), and + /// number of processors (`nprocs`) used to execute the parallel loops. The + /// number of `{procId, nprocs}` pairs returned must be equal to the number of + /// `parallelLoopRanges` passed into the callback, which in-turn is same as + /// the number of parallel loops for which the `distributionMethod` is + /// specified below. ProcInfoCallBackFn procInfo; /// Specification of how to distribute the `scf.parallel` loops that are /// generated. 
As the `scf.parallel` loop is generated, the elements of this diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 4e9cbe9d913d11..cf14555aa63fc7 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -334,21 +334,31 @@ void GenerateLoopNest::doit( SmallVector distributionMethod; if (distributionOptions) { auto &options = distributionOptions.getValue(); - unsigned index = 0; OpBuilder &builder = edsc::ScopedContext::getBuilderRef(); Location loc = edsc::ScopedContext::getLocation(); distributionMethod.assign(distributionOptions->distributionMethod.begin(), distributionOptions->distributionMethod.end()); - for (auto iteratorType : enumerate(iteratorTypes)) - if (isParallelIteratorType(iteratorType.value()) && - index < distributionMethod.size()) { + SmallVector parallelLoopRanges; + for (auto iteratorType : enumerate(iteratorTypes)) { + if (isParallelIteratorType(iteratorType.value())) + parallelLoopRanges.push_back(loopRanges[iteratorType.index()]); + } + if (distributionMethod.size() < parallelLoopRanges.size()) + parallelLoopRanges.resize(distributionMethod.size()); + SmallVector procInfo = + options.procInfo(builder, loc, parallelLoopRanges); + unsigned index = 0; + for (auto iteratorType : enumerate(iteratorTypes)) { + if (index >= procInfo.size()) + break; + if (isParallelIteratorType(iteratorType.value())) { unsigned i = iteratorType.index(); - ProcInfo procInfo = options.procInfo(builder, loc, index); - updateBoundsForCyclicDistribution(builder, loc, procInfo.procId, - procInfo.nprocs, lbsStorage[i], + updateBoundsForCyclicDistribution(builder, loc, procInfo[index].procId, + procInfo[index].nprocs, lbsStorage[i], ubsStorage[i], stepsStorage[i]); index++; } + } } ValueRange lbs(lbsStorage), ubs(ubsStorage), steps(stepsStorage); generateParallelLoopNest(lbs, ubs, steps, iteratorTypes, bodyBuilderFn, ivs, diff --git a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir index e1bc28e133bde2..08f6d19fe6d6f0 100644 --- a/mlir/test/Dialect/Linalg/tile-and-distribute.mlir +++ b/mlir/test/Dialect/Linalg/tile-and-distribute.mlir @@ -11,16 +11,16 @@ func @gemm1(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T1:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} // CHECK: scf.for %[[ARG3:.*]] = -// CHECK: %[[T3:.*]] = affine.apply #[[MAP0]]()[%[[T1]]] -// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[T3]], %[[ARG3]]] -// CHECK: %[[T11:.*]] = affine.apply #[[MAP0]]()[%[[T2]]] -// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T11]]] -// CHECK: %[[T15:.*]] = affine.apply #[[MAP0]]()[%[[T1]]] -// CHECK: %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T2]]] -// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[T15]], %[[T18]]] +// CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]] +// CHECK: %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]] +// CHECK: %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV3:.*]] = subview 
%[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX]]] // CHECK: linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]] // ----- @@ -36,22 +36,22 @@ func @gemm2(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T5:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK: %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T5]]] -// CHECK: %[[T7:.*]] = cmpi "slt", %[[T4]], %{{.*}} -// CHECK: %[[T8:.*]] = cmpi "slt", %[[T6]], %{{.*}} -// CHECK: %[[T9:.*]] = and %[[T7]], %[[T8]] -// CHECK: scf.if %[[T9]] +// CHECK-DAG: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK-DAG: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[ITERY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[ITERX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[INBOUNDSY:.*]] = cmpi "slt", %[[ITERY]], %{{.*}} +// CHECK: %[[INBOUNDSX:.*]] = cmpi "slt", %[[ITERX]], %{{.*}} +// CHECK: %[[INBOUNDS:.*]] = and %[[INBOUNDSY]], %[[INBOUNDSX]] +// CHECK: scf.if %[[INBOUNDS]] // CHECK: scf.for %[[ARG3:.*]] = -// CHECK: %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[T10]], %[[ARG3]]] -// CHECK: %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T5]]] -// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T18]]] -// CHECK: %[[T22:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T25:.*]] = affine.apply #[[MAP0]]()[%[[T5]]] -// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[T22]], %[[T25]]] +// CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]] +// CHECK: %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]] +// CHECK: %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX_2]]] // CHECK: linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]] // ----- @@ -67,15 +67,15 @@ func @gemm3(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T4:.*]] = "gpu.grid_dim"() {dimension = "y"} -// CHECK: %[[T5:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T4]]] -// CHECK: %[[T7:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK: %[[T8:.*]] = "gpu.grid_dim"() {dimension = "x"} -// CHECK: %[[T9:.*]] = affine.apply #[[MAP0]]()[%[[T7]]] -// CHECK: %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T8]]] -// CHECK: scf.parallel (%[[ARG3:.*]], %[[ARG4:.*]]) = (%[[T5]], %[[T9]]) to (%{{.*}}, %{{.*}}) step (%[[T6]], %[[T10]]) +// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"} +// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]] +// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]] +// CHECK: scf.parallel (%[[ARG3:.*]], 
%[[ARG4:.*]]) = (%[[LBY]], %[[LBX]]) to (%{{.*}}, %{{.*}}) step (%[[STEPY]], %[[STEPX]]) // CHECK: scf.for %[[ARG5:.*]] = // CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[ARG3]], %[[ARG5]]] // CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG5]], %[[ARG4]]] @@ -95,19 +95,19 @@ func @gemm4(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T5:.*]] = cmpi "slt", %[[T4]], %{{.*}} -// CHECK: scf.if %[[T5]] +// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[INBOUNDS:.*]] = cmpi "slt", %[[LBX]], %{{.*}} +// CHECK: scf.if %[[INBOUNDS]] // CHECK: scf.for %[[ARG3:.*]] = -// CHECK: %[[T6:.*]] = affine.apply #[[MAP0]]()[%[[T2]]] -// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[T6]], %[[ARG3]]] -// CHECK: %[[T14:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[T14]]] -// CHECK: %[[T18:.*]] = affine.apply #[[MAP0]]()[%[[T2]]] -// CHECK: %[[T21:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[T18]], %[[T21]]] +// CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[OFFSETY]], %[[ARG3]]] +// CHECK: %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG3]], %[[OFFSETX]]] +// CHECK: %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[OFFSETY_2]], %[[OFFSETX_2]]] // CHECK: linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]] // ----- @@ -123,21 +123,21 @@ func @gemm5(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T3:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T5:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK: %[[T6:.*]] = "gpu.grid_dim"() {dimension = "x"} -// CHECK: %[[T7:.*]] = affine.apply #[[MAP0]]()[%[[T5]]] -// CHECK: %[[T8:.*]] = affine.apply #[[MAP0]]()[%[[T6]]] -// CHECK: %[[T9:.*]] = cmpi "slt", %[[T4]], %{{.*}} -// CHECK: scf.if %[[T9]] -// CHECK: scf.parallel (%[[ARG3.*]]) = (%[[T7]]) to (%{{.*}}) step (%[[T8]]) +// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[NBLOCKSX:.*]] = "gpu.grid_dim"() {dimension = "x"} +// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[LBX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[STEPX:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSX]]] +// CHECK: %[[INBOUNDS:.*]] = cmpi "slt", %[[LBY]], %{{.*}} +// CHECK: scf.if %[[INBOUNDS]] +// CHECK: scf.parallel (%[[ARG3.*]]) = (%[[LBX]]) to (%{{.*}}) step (%[[STEPX]]) // CHECK: scf.for %[[ARG4:.*]] = -// CHECK: %[[T10:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[T10]], %[[ARG4]]] +// CHECK: %[[OFFSETY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[OFFSETY]], %[[ARG4]]] // 
CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG4]], %[[ARG3]]] -// CHECK: %[[T21:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[T21]], %[[ARG3]]] +// CHECK: %[[OFFSETY_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[OFFSETY_2]], %[[ARG3]]] // CHECK: linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]] // ----- @@ -153,16 +153,16 @@ func @gemm6(%a : memref, %b : memref, %c : memref) // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref // CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref -// CHECK: %[[T2:.*]] = "gpu.block_id"() {dimension = "y"} -// CHECK: %[[T3:.*]] = "gpu.grid_dim"() {dimension = "y"} -// CHECK: %[[T4:.*]] = affine.apply #[[MAP0]]()[%[[T2]]] -// CHECK: %[[T5:.*]] = affine.apply #[[MAP0]]()[%[[T3]]] -// CHECK: %[[T6:.*]] = "gpu.block_id"() {dimension = "x"} -// CHECK: scf.parallel (%[[ARG3.*]]) = (%[[T4]]) to (%{{.*}}) step (%[[T5]]) +// CHECK: %[[BIDY:.*]] = "gpu.block_id"() {dimension = "y"} +// CHECK: %[[NBLOCKSY:.*]] = "gpu.grid_dim"() {dimension = "y"} +// CHECK: %[[BIDX:.*]] = "gpu.block_id"() {dimension = "x"} +// CHECK: %[[LBY:.*]] = affine.apply #[[MAP0]]()[%[[BIDY]]] +// CHECK: %[[STEPY:.*]] = affine.apply #[[MAP0]]()[%[[NBLOCKSY]]] +// CHECK: scf.parallel (%[[ARG3.*]]) = (%[[LBY]]) to (%{{.*}}) step (%[[STEPY]]) // CHECK: scf.for %[[ARG4:.*]] = // CHECK: %[[SV1:.*]] = subview %[[ARG0]][%[[ARG3]], %[[ARG4]]] -// CHECK: %[[T14:.*]] = affine.apply #[[MAP0]]()[%[[T6]]] -// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG4]], %[[T14]]] -// CHECK: %[[T20:.*]] = affine.apply #[[MAP0]]()[%[[T6]]] -// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[T20]]] +// CHECK: %[[OFFSETX:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV2:.*]] = subview %[[ARG1]][%[[ARG4]], %[[OFFSETX]]] +// CHECK: %[[OFFSETX_2:.*]] = affine.apply #[[MAP0]]()[%[[BIDX]]] +// CHECK: %[[SV3:.*]] = subview %[[ARG2]][%[[ARG3]], %[[OFFSETX_2]]] // CHECK: linalg.matmul %[[SV1]], %[[SV2]], %[[SV3]] diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp index f6c1160d35b092..dffe4f2a0796a0 100644 --- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp @@ -289,19 +289,16 @@ static void fillPromotionCallBackPatterns(MLIRContext *ctx, } template -static ProcInfo getGpuProcIds(OpBuilder &b, Location loc, unsigned loopNum) { +static SmallVector +getGpuProcIds(OpBuilder &b, Location loc, + ArrayRef parallelLoopRanges) { Type indexType = b.getIndexType(); - switch (loopNum) { - case 0: - return {b.create(loc, indexType, b.getStringAttr("y")), - b.create(loc, indexType, b.getStringAttr("y"))}; - case 1: - return {b.create(loc, indexType, b.getStringAttr("x")), - b.create(loc, indexType, b.getStringAttr("x"))}; - default: - llvm_unreachable("test patterns handles only upto 2-level nested loops"); - } - return {nullptr, nullptr}; + SmallVector procInfo(2); + procInfo[0] = {b.create(loc, indexType, b.getStringAttr("y")), + b.create(loc, indexType, b.getStringAttr("y"))}; + procInfo[1] = {b.create(loc, indexType, b.getStringAttr("x")), + b.create(loc, indexType, b.getStringAttr("x"))}; + return procInfo; } static void fillTileAndDistributePatterns(MLIRContext *context, From e1de2b75501e5eaf8777bd5248382a7c55a44fd6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 18 Aug 2020 20:01:19 +0000 Subject: [PATCH 087/101] Separate the Registration from Loading dialects in the Context This 
changes the behavior of constructing MLIRContext to no longer load
globally registered dialects on construction. Instead, Dialects are only
loaded explicitly on demand:

- the Parser is lazily loading Dialects in the context as it encounters
  them during parsing. This is the only purpose for registering dialects
  without loading them in the context.
- Passes are expected to declare the dialects they will create entities
  from (Operations, Attributes, or Types), and the PassManager is loading
  Dialects into the Context when starting a pipeline.

This change simplifies the configuration of the registration: a compiler
only needs to load the dialect for the IR it will emit, and the optimizer
is self-contained and loads the required Dialects. For example in the Toy
tutorial, the compiler only needs to load the Toy dialect in the Context;
all the others (linalg, affine, std, LLVM, ...) are loaded automatically
depending on the optimization pipeline enabled.

To adjust to this change, stop using the existing dialect registration:
the global registry will be removed soon.

1) For passes, you need to override the method:

virtual void getDependentDialects(DialectRegistry &registry) const {}

and register on the provided registry any dialect that this pass can
produce. Passes defined in TableGen can provide this list in the
dependentDialects list field.

2) For dialects, on construction you can register dependent dialects
using the provided MLIRContext: `context.getOrLoadDialect()`
This is useful if a dialect may canonicalize or have interfaces involving
another dialect.

3) For loading IR, dialects that can be in the input file must be
explicitly registered with the context. `MlirOptMain()` takes an explicit
registry for this purpose. See how the standalone-opt.cpp example is set
up:

mlir::DialectRegistry registry;
mlir::registerDialect();
mlir::registerDialect();

Only operations from these two dialects can be in the input file.
To include all of the dialects in MLIR Core, you can populate the registry this way: mlir::registerAllDialects(registry); 4) For `mlir-translate` callback, as well as frontend, Dialects can be loaded in the context before emitting the IR: context.getOrLoadDialect() --- .../standalone-opt/standalone-opt.cpp | 11 ++- mlir/examples/toy/Ch2/toyc.cpp | 7 +- mlir/examples/toy/Ch3/toyc.cpp | 6 +- mlir/examples/toy/Ch4/toyc.cpp | 6 +- .../toy/Ch5/mlir/LowerToAffineLoops.cpp | 3 + mlir/examples/toy/Ch5/toyc.cpp | 6 +- .../toy/Ch6/mlir/LowerToAffineLoops.cpp | 3 + mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp | 3 + mlir/examples/toy/Ch6/toyc.cpp | 6 +- .../toy/Ch7/mlir/LowerToAffineLoops.cpp | 3 + mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp | 3 + mlir/examples/toy/Ch7/toyc.cpp | 6 +- mlir/include/mlir-c/IR.h | 6 ++ mlir/include/mlir/Conversion/Passes.td | 26 ++++++ mlir/include/mlir/Dialect/Affine/Passes.td | 1 + .../include/mlir/Dialect/LLVMIR/LLVMDialect.h | 1 + .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 5 ++ .../include/mlir/Dialect/LLVMIR/NVVMDialect.h | 1 + mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 1 + .../mlir/Dialect/LLVMIR/ROCDLDialect.h | 1 + mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 1 + mlir/include/mlir/Dialect/Linalg/Passes.td | 8 ++ mlir/include/mlir/Dialect/SCF/Passes.td | 1 + mlir/include/mlir/IR/Dialect.h | 87 ++++++++++++++++--- mlir/include/mlir/IR/FunctionSupport.h | 4 +- mlir/include/mlir/IR/MLIRContext.h | 67 +++++++++++--- mlir/include/mlir/IR/OpBase.td | 5 ++ mlir/include/mlir/InitAllDialects.h | 47 +++++----- mlir/include/mlir/InitAllTranslations.h | 4 +- mlir/include/mlir/Pass/Pass.h | 8 ++ mlir/include/mlir/Pass/PassBase.td | 3 + mlir/include/mlir/Pass/PassManager.h | 14 +++ mlir/include/mlir/Support/MlirOptMain.h | 20 ++++- mlir/include/mlir/TableGen/Dialect.h | 8 +- mlir/include/mlir/TableGen/Pass.h | 4 + mlir/include/mlir/Transforms/Passes.td | 2 + mlir/lib/CAPI/IR/IR.cpp | 9 +- ...ConvertGPULaunchFuncToVulkanLaunchFunc.cpp | 1 + .../Conversion/LinalgToLLVM/LinalgToLLVM.cpp | 1 + mlir/lib/Conversion/PassDetail.h | 32 +++++++ .../StandardToLLVM/StandardToLLVM.cpp | 2 +- .../LegalizeStandardForSPIRV.cpp | 1 + .../Dialect/Affine/Transforms/PassDetail.h | 10 +++ mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 1 + .../Dialect/Linalg/Transforms/PassDetail.h | 9 ++ mlir/lib/Dialect/SCF/Transforms/PassDetail.h | 5 ++ mlir/lib/Dialect/SDBM/SDBMExpr.cpp | 2 +- mlir/lib/ExecutionEngine/JitRunner.cpp | 4 +- mlir/lib/IR/Dialect.cpp | 36 +++++--- mlir/lib/IR/MLIRContext.cpp | 87 ++++++++++++------- mlir/lib/IR/Operation.cpp | 2 +- mlir/lib/IR/Verifier.cpp | 4 +- mlir/lib/Parser/AttributeParser.cpp | 10 ++- mlir/lib/Parser/DialectSymbolParser.cpp | 7 +- mlir/lib/Parser/Parser.cpp | 41 ++++++--- mlir/lib/Pass/Pass.cpp | 27 ++++++ mlir/lib/Pass/PassDetail.h | 4 + mlir/lib/Support/MlirOptMain.cpp | 48 +++++----- mlir/lib/TableGen/Dialect.cpp | 8 ++ mlir/lib/TableGen/Pass.cpp | 5 ++ mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp | 1 + mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 5 +- mlir/lib/Transforms/PassDetail.h | 7 ++ mlir/test/CAPI/ir.c | 1 + mlir/test/EDSC/builder-api-test.cpp | 18 ++-- mlir/test/SDBM/sdbm-api-test.cpp | 9 +- .../Dialect/Affine/TestVectorizationUtils.cpp | 4 + .../lib/Dialect/SPIRV/TestAvailability.cpp | 2 +- mlir/test/lib/Dialect/Test/TestDialect.cpp | 4 + mlir/test/lib/Dialect/Test/TestDialect.h | 2 + mlir/test/lib/Dialect/Test/TestPatterns.cpp | 4 + .../lib/Transforms/TestAllReduceLowering.cpp | 4 + .../lib/Transforms/TestBufferPlacement.cpp | 4 + 
.../lib/Transforms/TestGpuMemoryPromotion.cpp | 7 ++ .../lib/Transforms/TestLinalgHoisting.cpp | 4 + .../lib/Transforms/TestLinalgTransforms.cpp | 11 +++ .../lib/Transforms/TestVectorTransforms.cpp | 8 ++ mlir/test/mlir-opt/commandline.mlir | 2 +- .../mlir-linalg-ods-gen.cpp | 2 +- mlir/tools/mlir-opt/mlir-opt.cpp | 8 +- mlir/tools/mlir-tblgen/DialectGen.cpp | 20 ++++- mlir/tools/mlir-tblgen/PassGen.cpp | 21 ++++- mlir/tools/mlir-translate/mlir-translate.cpp | 3 +- .../Dialect/Quant/QuantizationUtilsTest.cpp | 15 ++-- .../Dialect/SPIRV/DeserializationTest.cpp | 3 +- .../Dialect/SPIRV/SerializationTest.cpp | 5 +- mlir/unittests/IR/AttributeTest.cpp | 32 +++---- mlir/unittests/IR/DialectTest.cpp | 6 +- mlir/unittests/IR/OperationSupportTest.cpp | 8 +- mlir/unittests/Pass/AnalysisManagerTest.cpp | 8 +- mlir/unittests/SDBM/SDBMTest.cpp | 7 +- mlir/unittests/TableGen/OpBuildGen.cpp | 9 +- mlir/unittests/TableGen/StructsGenTest.cpp | 2 +- 93 files changed, 759 insertions(+), 231 deletions(-) diff --git a/mlir/examples/standalone/standalone-opt/standalone-opt.cpp b/mlir/examples/standalone/standalone-opt/standalone-opt.cpp index b33dab26a71367..2dfb859ebd0526 100644 --- a/mlir/examples/standalone/standalone-opt/standalone-opt.cpp +++ b/mlir/examples/standalone/standalone-opt/standalone-opt.cpp @@ -24,9 +24,16 @@ int main(int argc, char **argv) { mlir::registerAllDialects(); mlir::registerAllPasses(); + // TODO: Register standalone passes here. + mlir::DialectRegistry registry; mlir::registerDialect(); - // TODO: Register standalone passes here. + mlir::registerDialect(); + // Add the following to include *all* MLIR Core dialects, or selectively + // include what you need like above. You only need to register dialects that + // will be *parsed* by the tool, not the one generated + // registerAllDialects(registry); - return failed(mlir::MlirOptMain(argc, argv, "Standalone optimizer driver\n")); + return failed( + mlir::MlirOptMain(argc, argv, "Standalone optimizer driver\n", registry)); } diff --git a/mlir/examples/toy/Ch2/toyc.cpp b/mlir/examples/toy/Ch2/toyc.cpp index d0880ce0971b6e..99232d8f24a4a5 100644 --- a/mlir/examples/toy/Ch2/toyc.cpp +++ b/mlir/examples/toy/Ch2/toyc.cpp @@ -68,10 +68,9 @@ std::unique_ptr parseInputFile(llvm::StringRef filename) { } int dumpMLIR() { - // Register our Dialect with MLIR. - mlir::registerDialect(); - - mlir::MLIRContext context; + mlir::MLIRContext context(/*loadAllDialects=*/false); + // Load our Dialect in this MLIR Context. + context.getOrLoadDialect(); // Handle '.toy' input to the compiler. if (inputType != InputType::MLIR && diff --git a/mlir/examples/toy/Ch3/toyc.cpp b/mlir/examples/toy/Ch3/toyc.cpp index f9d5631719e8b6..d0430ce16e54a8 100644 --- a/mlir/examples/toy/Ch3/toyc.cpp +++ b/mlir/examples/toy/Ch3/toyc.cpp @@ -102,10 +102,10 @@ int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, } int dumpMLIR() { - // Register our Dialect with MLIR. - mlir::registerDialect(); + mlir::MLIRContext context(/*loadAllDialects=*/false); + // Load our Dialect in this MLIR Context. 
+ context.getOrLoadDialect(); - mlir::MLIRContext context; mlir::OwningModuleRef module; llvm::SourceMgr sourceMgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp index e11f35c5f7e10c..9f95887d270738 100644 --- a/mlir/examples/toy/Ch4/toyc.cpp +++ b/mlir/examples/toy/Ch4/toyc.cpp @@ -103,10 +103,10 @@ int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, } int dumpMLIR() { - // Register our Dialect with MLIR. - mlir::registerDialect(); + mlir::MLIRContext context(/*loadAllDialects=*/false); + // Load our Dialect in this MLIR Context. + context.getOrLoadDialect(); - mlir::MLIRContext context; mlir::OwningModuleRef module; llvm::SourceMgr sourceMgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); diff --git a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp index 3097681ea3fad3..92fd246a135886 100644 --- a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp @@ -256,6 +256,9 @@ struct TransposeOpLowering : public ConversionPattern { namespace { struct ToyToAffineLoweringPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } void runOnFunction() final; }; } // end anonymous namespace. diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp index ed0496957093bb..16faac02fc60d0 100644 --- a/mlir/examples/toy/Ch5/toyc.cpp +++ b/mlir/examples/toy/Ch5/toyc.cpp @@ -106,10 +106,10 @@ int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context, } int dumpMLIR() { - // Register our Dialect with MLIR. - mlir::registerDialect(); + mlir::MLIRContext context(/*loadAllDialects=*/false); + // Load our Dialect in this MLIR Context. + context.getOrLoadDialect(); - mlir::MLIRContext context; mlir::OwningModuleRef module; llvm::SourceMgr sourceMgr; mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context); diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp index cac3415f48d68f..f3857f35e25c95 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp @@ -255,6 +255,9 @@ struct TransposeOpLowering : public ConversionPattern { namespace { struct ToyToAffineLoweringPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } void runOnFunction() final; }; } // end anonymous namespace. diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp index 74b32dc0ca1102..19bf27e1864d18 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp @@ -159,6 +159,9 @@ class PrintOpLowering : public ConversionPattern { namespace { struct ToyToLLVMLoweringPass : public PassWrapper> { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } void runOnOperation() final; }; } // end anonymous namespace diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp index bdcdf1af7ea831..9504a38b8784c9 100644 --- a/mlir/examples/toy/Ch6/toyc.cpp +++ b/mlir/examples/toy/Ch6/toyc.cpp @@ -255,10 +255,10 @@ int main(int argc, char **argv) { // If we aren't dumping the AST, then we are compiling with/to MLIR. - // Register our Dialect with MLIR. 
diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp
index ed0496957093bb..16faac02fc60d0 100644
--- a/mlir/examples/toy/Ch5/toyc.cpp
+++ b/mlir/examples/toy/Ch5/toyc.cpp
@@ -106,10 +106,10 @@ int loadMLIR(llvm::SourceMgr &sourceMgr, mlir::MLIRContext &context,
 }
 
 int dumpMLIR() {
-  // Register our Dialect with MLIR.
-  mlir::registerDialect<mlir::toy::ToyDialect>();
+  mlir::MLIRContext context(/*loadAllDialects=*/false);
+  // Load our Dialect in this MLIR Context.
+  context.getOrLoadDialect<mlir::toy::ToyDialect>();
 
-  mlir::MLIRContext context;
   mlir::OwningModuleRef module;
   llvm::SourceMgr sourceMgr;
   mlir::SourceMgrDiagnosticHandler sourceMgrHandler(sourceMgr, &context);
diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
index cac3415f48d68f..f3857f35e25c95 100644
--- a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
+++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp
@@ -255,6 +255,9 @@ struct TransposeOpLowering : public ConversionPattern {
 namespace {
 struct ToyToAffineLoweringPass
     : public PassWrapper<ToyToAffineLoweringPass, FunctionPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AffineDialect, StandardOpsDialect>();
+  }
   void runOnFunction() final;
 };
 } // end anonymous namespace.
diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
index 74b32dc0ca1102..19bf27e1864d18 100644
--- a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
+++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp
@@ -159,6 +159,9 @@ class PrintOpLowering : public ConversionPattern {
 namespace {
 struct ToyToLLVMLoweringPass
     : public PassWrapper<ToyToLLVMLoweringPass, OperationPass<ModuleOp>> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<LLVM::LLVMDialect, scf::SCFDialect>();
+  }
   void runOnOperation() final;
 };
 } // end anonymous namespace
diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp
index bdcdf1af7ea831..9504a38b8784c9 100644
--- a/mlir/examples/toy/Ch6/toyc.cpp
+++ b/mlir/examples/toy/Ch6/toyc.cpp
@@ -255,10 +255,10 @@ int main(int argc, char **argv) {
   // If we aren't dumping the AST, then we are compiling with/to MLIR.
 
-  // Register our Dialect with MLIR.
-  mlir::registerDialect<mlir::toy::ToyDialect>();
+  mlir::MLIRContext context(/*loadAllDialects=*/false);
+  // Load our Dialect in this MLIR Context.
+  context.getOrLoadDialect<mlir::toy::ToyDialect>();
 
-  mlir::MLIRContext context;
   mlir::OwningModuleRef module;
   if (int error = loadAndProcessMLIR(context, module))
     return error;
diff --git a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
index 3097681ea3fad3..92fd246a135886 100644
--- a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
+++ b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp
@@ -256,6 +256,9 @@ struct TransposeOpLowering : public ConversionPattern {
 namespace {
 struct ToyToAffineLoweringPass
     : public PassWrapper<ToyToAffineLoweringPass, FunctionPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AffineDialect, StandardOpsDialect>();
+  }
   void runOnFunction() final;
 };
 } // end anonymous namespace.
diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
index 74b32dc0ca1102..19bf27e1864d18 100644
--- a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
+++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp
@@ -159,6 +159,9 @@ class PrintOpLowering : public ConversionPattern {
 namespace {
 struct ToyToLLVMLoweringPass
     : public PassWrapper<ToyToLLVMLoweringPass, OperationPass<ModuleOp>> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<LLVM::LLVMDialect, scf::SCFDialect>();
+  }
   void runOnOperation() final;
 };
 } // end anonymous namespace
diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp
index c1cc207a406ce2..cb3b455dc7ecbe 100644
--- a/mlir/examples/toy/Ch7/toyc.cpp
+++ b/mlir/examples/toy/Ch7/toyc.cpp
@@ -256,10 +256,10 @@ int main(int argc, char **argv) {
   // If we aren't dumping the AST, then we are compiling with/to MLIR.
 
-  // Register our Dialect with MLIR.
-  mlir::registerDialect<mlir::toy::ToyDialect>();
+  mlir::MLIRContext context(/*loadAllDialects=*/false);
+  // Load our Dialect in this MLIR Context.
+  context.getOrLoadDialect<mlir::toy::ToyDialect>();
 
-  mlir::MLIRContext context;
   mlir::OwningModuleRef module;
   if (int error = loadAndProcessMLIR(context, module))
     return error;
diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h
index 68546bf35625a2..f0c421bd5cce9b 100644
--- a/mlir/include/mlir-c/IR.h
+++ b/mlir/include/mlir-c/IR.h
@@ -88,6 +88,12 @@ MlirContext mlirContextCreate();
 /** Takes an MLIR context owned by the caller and destroys it. */
 void mlirContextDestroy(MlirContext context);
 
+/** Loads all the globally registered dialects in the provided context.
+ * TODO: remove the concept of globally registered dialects by exposing the
+ * DialectRegistry.
+ */
+void mlirContextLoadAllDialects(MlirContext context);
+
 /*============================================================================*/
 /* Location API.                                                              */
 /*============================================================================*/
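A quick sketch of how a client would exercise this transitional entry point; it
uses only the functions declared in the header above and is valid C as well as
C++:

    #include "mlir-c/IR.h"

    int main(void) {
      MlirContext ctx = mlirContextCreate();
      // Transitional: make every globally registered dialect available to this
      // context, mirroring loadAllGloballyRegisteredDialects() on the C++ side.
      mlirContextLoadAllDialects(ctx);
      mlirContextDestroy(ctx);
      return 0;
    }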
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 4ff23d71a5c0bf..0a043c01e98140 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -66,6 +66,11 @@ def ConvertAffineToStandard : Pass<"lower-affine"> {
     `affine.apply`.
   }];
   let constructor = "mlir::createLowerAffinePass()";
+  let dependentDialects = [
+    "scf::SCFDialect",
+    "StandardOpsDialect",
+    "vector::VectorDialect"
+  ];
 }
 
 //===----------------------------------------------------------------------===//
@@ -76,6 +81,7 @@ def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
   let summary = "Convert the operations from the avx512 dialect into the LLVM "
                 "dialect";
   let constructor = "mlir::createConvertAVX512ToLLVMPass()";
+  let dependentDialects = ["LLVM::LLVMDialect", "LLVM::LLVMAVX512Dialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -98,6 +104,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
 def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
   let summary = "Generate NVVM operations for gpu operations";
   let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()";
+  let dependentDialects = ["NVVM::NVVMDialect"];
   let options = [
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
@@ -112,6 +119,7 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
   let summary = "Generate ROCDL operations for gpu operations";
   let constructor = "mlir::createLowerGpuOpsToROCDLOpsPass()";
+  let dependentDialects = ["ROCDL::ROCDLDialect"];
   let options = [
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
@@ -126,6 +134,7 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
 def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   let summary = "Convert GPU dialect to SPIR-V dialect";
   let constructor = "mlir::createConvertGPUToSPIRVPass()";
+  let dependentDialects = ["spirv::SPIRVDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -136,6 +145,7 @@ def ConvertGpuLaunchFuncToVulkanLaunchFunc
     : Pass<"convert-gpu-launch-to-vulkan-launch", "ModuleOp"> {
   let summary = "Convert gpu.launch_func to vulkanLaunch external call";
   let constructor = "mlir::createConvertGpuLaunchFuncToVulkanLaunchFuncPass()";
+  let dependentDialects = ["spirv::SPIRVDialect"];
 }
 
 def ConvertVulkanLaunchFuncToVulkanCalls
@@ -143,6 +153,7 @@ def ConvertVulkanLaunchFuncToVulkanCalls
   let summary = "Convert vulkanLaunch external call to Vulkan runtime external "
                 "calls";
   let constructor = "mlir::createConvertVulkanLaunchFuncToVulkanCallsPass()";
+  let dependentDialects = ["LLVM::LLVMDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -153,6 +164,7 @@ def ConvertLinalgToLLVM : Pass<"convert-linalg-to-llvm", "ModuleOp"> {
   let summary = "Convert the operations from the linalg dialect into the LLVM "
                 "dialect";
   let constructor = "mlir::createConvertLinalgToLLVMPass()";
+  let dependentDialects = ["scf::SCFDialect", "LLVM::LLVMDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -163,6 +175,7 @@ def ConvertLinalgToStandard :
     Pass<"convert-linalg-to-std", "ModuleOp"> {
   let summary = "Convert the operations from the linalg dialect into the "
                 "Standard dialect";
   let constructor = "mlir::createConvertLinalgToStandardPass()";
+  let dependentDialects = ["StandardOpsDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -172,6 +185,7 @@ def ConvertLinalgToStandard :
Pass<"convert-linalg-to-std", "ModuleOp"> { def ConvertLinalgToSPIRV : Pass<"convert-linalg-to-spirv", "ModuleOp"> { let summary = "Convert Linalg ops to SPIR-V ops"; let constructor = "mlir::createLinalgToSPIRVPass()"; + let dependentDialects = ["spirv::SPIRVDialect"]; } //===----------------------------------------------------------------------===// @@ -182,6 +196,7 @@ def SCFToStandard : Pass<"convert-scf-to-std"> { let summary = "Convert SCF dialect to Standard dialect, replacing structured" " control flow with a CFG"; let constructor = "mlir::createLowerToCFGPass()"; + let dependentDialects = ["StandardOpsDialect"]; } //===----------------------------------------------------------------------===// @@ -191,6 +206,7 @@ def SCFToStandard : Pass<"convert-scf-to-std"> { def ConvertAffineForToGPU : FunctionPass<"convert-affine-for-to-gpu"> { let summary = "Convert top-level AffineFor Ops to GPU kernels"; let constructor = "mlir::createAffineForToGPUPass()"; + let dependentDialects = ["gpu::GPUDialect"]; let options = [ Option<"numBlockDims", "gpu-block-dims", "unsigned", /*default=*/"1u", "Number of GPU block dimensions for mapping">, @@ -202,6 +218,7 @@ def ConvertAffineForToGPU : FunctionPass<"convert-affine-for-to-gpu"> { def ConvertParallelLoopToGpu : Pass<"convert-parallel-loops-to-gpu"> { let summary = "Convert mapped scf.parallel ops to gpu launch operations"; let constructor = "mlir::createParallelLoopToGpuPass()"; + let dependentDialects = ["AffineDialect", "gpu::GPUDialect"]; } //===----------------------------------------------------------------------===// @@ -212,6 +229,7 @@ def ConvertShapeToStandard : Pass<"convert-shape-to-std", "ModuleOp"> { let summary = "Convert operations from the shape dialect into the standard " "dialect"; let constructor = "mlir::createConvertShapeToStandardPass()"; + let dependentDialects = ["StandardOpsDialect"]; } //===----------------------------------------------------------------------===// @@ -221,6 +239,7 @@ def ConvertShapeToStandard : Pass<"convert-shape-to-std", "ModuleOp"> { def ConvertShapeToSCF : FunctionPass<"convert-shape-to-scf"> { let summary = "Convert operations from the shape dialect to the SCF dialect"; let constructor = "mlir::createConvertShapeToSCFPass()"; + let dependentDialects = ["scf::SCFDialect"]; } //===----------------------------------------------------------------------===// @@ -230,6 +249,7 @@ def ConvertShapeToSCF : FunctionPass<"convert-shape-to-scf"> { def ConvertSPIRVToLLVM : Pass<"convert-spirv-to-llvm", "ModuleOp"> { let summary = "Convert SPIR-V dialect to LLVM dialect"; let constructor = "mlir::createConvertSPIRVToLLVMPass()"; + let dependentDialects = ["LLVM::LLVMDialect"]; } //===----------------------------------------------------------------------===// @@ -264,6 +284,7 @@ def ConvertStandardToLLVM : Pass<"convert-std-to-llvm", "ModuleOp"> { LLVM IR types. 
   }];
   let constructor = "mlir::createLowerToLLVMPass()";
+  let dependentDialects = ["LLVM::LLVMDialect"];
   let options = [
     Option<"useAlignedAlloc", "use-aligned-alloc", "bool", /*default=*/"false",
            "Use aligned_alloc in place of malloc for heap allocations">,
@@ -291,11 +312,13 @@ def ConvertStandardToLLVM : Pass<"convert-std-to-llvm", "ModuleOp"> {
 def LegalizeStandardForSPIRV : Pass<"legalize-std-for-spirv"> {
   let summary = "Legalize standard ops for SPIR-V lowering";
   let constructor = "mlir::createLegalizeStdOpsForSPIRVLoweringPass()";
+  let dependentDialects = ["spirv::SPIRVDialect"];
 }
 
 def ConvertStandardToSPIRV : Pass<"convert-std-to-spirv", "ModuleOp"> {
   let summary = "Convert Standard Ops to SPIR-V dialect";
   let constructor = "mlir::createConvertStandardToSPIRVPass()";
+  let dependentDialects = ["spirv::SPIRVDialect"];
 }
 
 //===----------------------------------------------------------------------===//
@@ -306,6 +329,7 @@ def ConvertVectorToSCF : FunctionPass<"convert-vector-to-scf"> {
   let summary = "Lower the operations from the vector dialect into the SCF "
                 "dialect";
   let constructor = "mlir::createConvertVectorToSCFPass()";
+  let dependentDialects = ["AffineDialect", "scf::SCFDialect"];
   let options = [
     Option<"fullUnroll", "full-unroll", "bool", /*default=*/"false",
            "Perform full unrolling when converting vector transfers to SCF">,
@@ -320,6 +344,7 @@ def ConvertVectorToLLVM : Pass<"convert-vector-to-llvm", "ModuleOp"> {
   let summary = "Lower the operations from the vector dialect into the LLVM "
                 "dialect";
   let constructor = "mlir::createConvertVectorToLLVMPass()";
+  let dependentDialects = ["LLVM::LLVMDialect"];
   let options = [
     Option<"reassociateFPReductions", "reassociate-fp-reductions", "bool",
            /*default=*/"false",
@@ -335,6 +360,7 @@ def ConvertVectorToROCDL : Pass<"convert-vector-to-rocdl", "ModuleOp"> {
   let summary = "Lower the operations from the vector dialect into the ROCDL "
                 "dialect";
   let constructor = "mlir::createConvertVectorToROCDLPass()";
+  let dependentDialects = ["ROCDL::ROCDLDialect"];
 }
 
 #endif // MLIR_CONVERSION_PASSES
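The PassGen.cpp change in the diffstat suggests that each dependentDialects
entry is expanded into a generated getDependentDialects override. As a rough
sketch (the exact emitted code is an assumption based on this patch, not
verbatim tblgen output), an entry such as ["LLVM::LLVMDialect"] would expand to
approximately:

    // Assumed shape of the generated hook for a pass declaring
    // LLVM::LLVMDialect as a dependent dialect.
    void getDependentDialects(::mlir::DialectRegistry &registry) const override {
      registry.insert<LLVM::LLVMDialect>();
    }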
diff --git a/mlir/include/mlir/Dialect/Affine/Passes.td b/mlir/include/mlir/Dialect/Affine/Passes.td
index 810640058155fb..f43fabd19aaefe 100644
--- a/mlir/include/mlir/Dialect/Affine/Passes.td
+++ b/mlir/include/mlir/Dialect/Affine/Passes.td
@@ -94,6 +94,7 @@ def AffineLoopUnrollAndJam : FunctionPass<"affine-loop-unroll-jam"> {
 def AffineVectorize : FunctionPass<"affine-super-vectorize"> {
   let summary = "Vectorize to a target independent n-D vector abstraction";
   let constructor = "mlir::createSuperVectorizePass()";
+  let dependentDialects = ["vector::VectorDialect"];
   let options = [
     ListOption<"vectorSizes", "virtual-vector-size", "int64_t",
                "Specify an n-D virtual vector size for vectorization",
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
index 04700f0aa17dbb..2f465f07a97e42 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h
@@ -15,6 +15,7 @@
 #define MLIR_DIALECT_LLVMIR_LLVMDIALECT_H_
 
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/Function.h"
 #include "mlir/IR/OpDefinition.h"
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
index e824f97bc28544..226743587bd9d5 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td
@@ -19,6 +19,11 @@ include "mlir/IR/OpBase.td"
 def LLVM_Dialect : Dialect {
   let name = "llvm";
   let cppNamespace = "LLVM";
+
+  /// FIXME: at the moment this is a dependency of the translation to LLVM IR,
+  /// not really one of this dialect per se.
+  let dependentDialects = ["omp::OpenMPDialect"];
+
   let hasRegionArgAttrVerify = 1;
   let hasOperationAttrVerify = 1;
   let extraClassDeclaration = [{
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index 86d437c9b561b7..9cc5314bdb901f 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_
 #define MLIR_DIALECT_LLVMIR_NVVMDIALECT_H_
 
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index 5f022e32b801d6..7d47e5012ac9a0 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -23,6 +23,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 def NVVM_Dialect : Dialect {
   let name = "nvvm";
   let cppNamespace = "NVVM";
+  let dependentDialects = ["LLVM::LLVMDialect"];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index bf761c357f9074..eb40373c3f1171 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -22,6 +22,7 @@
 #ifndef MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_
 #define MLIR_DIALECT_LLVMIR_ROCDLDIALECT_H_
 
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 0cd11690daa8ba..f85c4f02899b46 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -23,6 +23,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td"
 def ROCDL_Dialect : Dialect {
   let name = "rocdl";
   let cppNamespace = "ROCDL";
+  let dependentDialects = ["LLVM::LLVMDialect"];
 }
 
 //===----------------------------------------------------------------------===//
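Analogously to the pass-level hook, the dialect-level dependentDialects above
should make loading the LLVM dialect pull in OpenMP automatically. A small
sketch of the assumed behavior (the assertion encodes the expectation of this
patch, not a documented guarantee):

    #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
    #include "mlir/IR/MLIRContext.h"
    #include <cassert>

    int main() {
      mlir::MLIRContext context(/*loadAllDialects=*/false);
      context.loadDialect<mlir::LLVM::LLVMDialect>();
      // OpenMP should come along as a declared dependency of the LLVM dialect.
      assert(context.getLoadedDialect<mlir::omp::OpenMPDialect>() != nullptr);
      return 0;
    }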
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 11f12ad30eb6c0..dcf4b5ec06cb6f 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -30,17 +30,20 @@ def LinalgFusion : FunctionPass<"linalg-fusion"> {
 def LinalgFusionOfTensorOps : Pass<"linalg-fusion-for-tensor-ops"> {
   let summary = "Fuse operations on RankedTensorType in linalg dialect";
   let constructor = "mlir::createLinalgFusionOfTensorOpsPass()";
+  let dependentDialects = ["AffineDialect"];
 }
 
 def LinalgLowerToAffineLoops : FunctionPass<"convert-linalg-to-affine-loops"> {
   let summary = "Lower the operations from the linalg dialect into affine "
                 "loops";
   let constructor = "mlir::createConvertLinalgToAffineLoopsPass()";
+  let dependentDialects = ["AffineDialect"];
 }
 
 def LinalgLowerToLoops : FunctionPass<"convert-linalg-to-loops"> {
   let summary = "Lower the operations from the linalg dialect into loops";
   let constructor = "mlir::createConvertLinalgToLoopsPass()";
+  let dependentDialects = ["scf::SCFDialect", "AffineDialect"];
 }
 
 def LinalgOnTensorsToBuffers
     : Pass<"convert-linalg-on-tensors-to-buffers", "ModuleOp"> {
@@ -54,6 +57,7 @@ def LinalgLowerToParallelLoops
   let summary = "Lower the operations from the linalg dialect into parallel "
                 "loops";
   let constructor = "mlir::createConvertLinalgToParallelLoopsPass()";
+  let dependentDialects = ["AffineDialect", "scf::SCFDialect"];
 }
 
 def LinalgPromotion : FunctionPass<"linalg-promote-subviews"> {
@@ -70,6 +74,9 @@ def LinalgPromotion : FunctionPass<"linalg-promote-subviews"> {
 def LinalgTiling : FunctionPass<"linalg-tile"> {
   let summary = "Tile operations in the linalg dialect";
   let constructor = "mlir::createLinalgTilingPass()";
+  let dependentDialects = [
+    "AffineDialect", "scf::SCFDialect"
+  ];
   let options = [
     ListOption<"tileSizes", "linalg-tile-sizes", "int64_t",
                "Test generation of dynamic promoted buffers",
@@ -86,6 +93,7 @@ def LinalgTilingToParallelLoops
                "Test generation of dynamic promoted buffers",
                "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
   ];
+  let dependentDialects = ["AffineDialect", "scf::SCFDialect"];
 }
 
 #endif // MLIR_DIALECT_LINALG_PASSES
diff --git a/mlir/include/mlir/Dialect/SCF/Passes.td b/mlir/include/mlir/Dialect/SCF/Passes.td
index 483d0ba7c7be08..6f3cf0e1264235 100644
--- a/mlir/include/mlir/Dialect/SCF/Passes.td
+++ b/mlir/include/mlir/Dialect/SCF/Passes.td
@@ -36,6 +36,7 @@ def SCFParallelLoopTiling : FunctionPass<"parallel-loop-tiling"> {
                "Factors to tile parallel loops by",
                "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
   ];
+  let dependentDialects = ["AffineDialect"];
 }
 
 #endif // MLIR_DIALECT_SCF_PASSES
diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h
index 4f9e4cb3618b65..8c0fef0d7ccf64 100644
--- a/mlir/include/mlir/IR/Dialect.h
+++ b/mlir/include/mlir/IR/Dialect.h
@@ -16,6 +16,8 @@
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/Support/TypeID.h"
 
+#include <map>
+
 namespace mlir {
 class DialectAsmParser;
 class DialectAsmPrinter;
@@ -23,7 +25,7 @@ class DialectInterface;
 class OpBuilder;
 class Type;
 
-using DialectAllocatorFunction = std::function<void(MLIRContext *)>;
+using DialectAllocatorFunction = std::function<Dialect *(MLIRContext *)>;
 
 /// Dialects are groups of MLIR operations and behavior associated with the
 /// entire group. For example, hooks into other systems for constant folding,
@@ -212,30 +214,87 @@ class Dialect {
   /// A collection of registered dialect interfaces.
   DenseMap<TypeID, std::unique_ptr<DialectInterface>> registeredInterfaces;
 
-  /// Registers a specific dialect creation function with the global registry.
-  /// Used through the registerDialect template.
-  /// Registrations are deduplicated by dialect TypeID and only the first
-  /// registration will be used.
-  static void
-  registerDialectAllocator(TypeID typeID,
                           const DialectAllocatorFunction &function);
-
   template <typename ConcreteDialect> friend void registerDialect();
   friend class MLIRContext;
 };
 
-/// Registers all dialects and hooks from the global registries with the
-/// specified MLIRContext.
+/// The DialectRegistry maps a dialect namespace to a constructor for the
+/// matching dialect.
+/// This allows for decoupling the list of dialects "available" from the
+/// dialects loaded in the Context. The parser in particular will lazily load
+/// dialects in the Context as operations are encountered.
+class DialectRegistry {
+  using MapTy =
+      std::map<std::string, std::pair<TypeID, DialectAllocatorFunction>>;
+
+public:
+  template <typename ConcreteDialect>
+  void insert() {
+    insert(TypeID::get<ConcreteDialect>(),
+           ConcreteDialect::getDialectNamespace(),
+           static_cast<DialectAllocatorFunction>(([](MLIRContext *ctx) {
+             // Just allocate the dialect, the context
+             // takes ownership of it.
+             return ctx->getOrLoadDialect<ConcreteDialect>();
+           })));
+  }
+
+  template <typename ConcreteDialect, typename OtherDialect,
+            typename... MoreDialects>
+  void insert() {
+    insert<ConcreteDialect>();
+    insert<OtherDialect, MoreDialects...>();
+  }
+
+  /// Add a new dialect constructor to the registry.
+  void insert(TypeID typeID, StringRef name, DialectAllocatorFunction ctor);
+
+  /// Load a dialect for this namespace in the provided context.
+  Dialect *loadByName(StringRef name, MLIRContext *context);
+
+  // Register all dialects available in the current registry into the provided
+  // destination registry.
+  void appendTo(DialectRegistry &destination) {
+    for (const auto &nameAndRegistrationIt : registry)
+      destination.insert(nameAndRegistrationIt.second.first,
+                         nameAndRegistrationIt.first,
+                         nameAndRegistrationIt.second.second);
+  }
+  // Load all dialects available in the registry in the provided context.
+  void loadAll(MLIRContext *context) {
+    for (const auto &nameAndRegistrationIt : registry)
+      nameAndRegistrationIt.second.second(context);
+  }
+
+  MapTy::const_iterator begin() const { return registry.begin(); }
+  MapTy::const_iterator end() const { return registry.end(); }
+
+private:
+  MapTy registry;
+};
+
+/// Deprecated: this provides a global registry for convenience, while we're
+/// transitioning the registration mechanism to a stateless approach.
+DialectRegistry &getGlobalDialectRegistry();
+
+/// Registers all dialects from the global registries with the
+/// specified MLIRContext. This won't load the dialects in the context,
+/// but only make them available for lazy loading by name.
 /// Note: This method is not thread-safe.
 void registerAllDialects(MLIRContext *context);
 
+/// Register and return the dialect with the given namespace in the provided
+/// context. Returns nullptr if there is no constructor registered for this
+/// dialect.
+inline Dialect *registerDialect(StringRef name, MLIRContext *context) {
+  return getGlobalDialectRegistry().loadByName(name, context);
+}
+
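Putting the registry and the context together, a minimal usage sketch based on
the APIs introduced in this patch (the dialect choice is illustrative):

    #include "mlir/Dialect/StandardOps/IR/Ops.h"
    #include "mlir/IR/Dialect.h"
    #include "mlir/IR/MLIRContext.h"

    int main() {
      // Inserting only records a constructor; nothing is built yet.
      mlir::DialectRegistry registry;
      registry.insert<mlir::StandardOpsDialect>();

      mlir::MLIRContext context(/*loadAllDialects=*/false);
      registry.appendTo(context.getDialectRegistry());

      // Loading happens on demand, e.g. by namespace when parsing hits an op.
      mlir::Dialect *dialect = context.getOrLoadDialect("std");
      return dialect ? 0 : 1;
    }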
 /// Utility to register a dialect. Client can register their dialect with the
 /// global registry by calling registerDialect<MyDialect>();
 /// Note: This method is not thread-safe.
 template <typename ConcreteDialect> void registerDialect() {
-  Dialect::registerDialectAllocator(
-      TypeID::get<ConcreteDialect>(),
-      [](MLIRContext *ctx) { ctx->getOrCreateDialect<ConcreteDialect>(); });
+  getGlobalDialectRegistry().insert<ConcreteDialect>();
 }
 
 /// DialectRegistration provides a global initializer that registers a Dialect
diff --git a/mlir/include/mlir/IR/FunctionSupport.h b/mlir/include/mlir/IR/FunctionSupport.h
index 7e281f393af946..3d467cd4f3642f 100644
--- a/mlir/include/mlir/IR/FunctionSupport.h
+++ b/mlir/include/mlir/IR/FunctionSupport.h
@@ -428,7 +428,7 @@ LogicalResult FunctionLike<ConcreteType>::verifyTrait(Operation *op) {
       if (!attr.first.strref().contains('.'))
         return funcOp.emitOpError("arguments may only have dialect attributes");
       auto dialectNamePair = attr.first.strref().split('.');
-      if (auto *dialect = ctx->getRegisteredDialect(dialectNamePair.first)) {
+      if (auto *dialect = ctx->getLoadedDialect(dialectNamePair.first)) {
         if (failed(dialect->verifyRegionArgAttribute(op, /*regionIndex=*/0,
                                                      /*argIndex=*/i, attr)))
           return failure();
@@ -444,7 +444,7 @@ LogicalResult FunctionLike<ConcreteType>::verifyTrait(Operation *op) {
       if (!attr.first.strref().contains('.'))
         return funcOp.emitOpError("results may only have dialect attributes");
       auto dialectNamePair = attr.first.strref().split('.');
-      if (auto *dialect = ctx->getRegisteredDialect(dialectNamePair.first)) {
+      if (auto *dialect = ctx->getLoadedDialect(dialectNamePair.first)) {
         if (failed(dialect->verifyRegionResultAttribute(op, /*regionIndex=*/0,
                                                         /*resultIndex=*/i,
                                                         attr)))
diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h
index 0192a8ae06af87..e8a5d6e6d2368b 100644
--- a/mlir/include/mlir/IR/MLIRContext.h
+++ b/mlir/include/mlir/IR/MLIRContext.h
@@ -19,10 +19,12 @@ namespace mlir {
 class AbstractOperation;
 class DiagnosticEngine;
 class Dialect;
+class DialectRegistry;
 class InFlightDiagnostic;
 class Location;
 class MLIRContextImpl;
 class StorageUniquer;
+DialectRegistry &getGlobalDialectRegistry();
 
 /// MLIRContext is the top-level object for a collection of MLIR modules. It
 /// holds immortal uniqued objects like types, and the tables used to unique
@@ -34,34 +36,69 @@ class StorageUniquer;
 ///
 class MLIRContext {
 public:
-  explicit MLIRContext();
+  /// Create a new Context.
+  /// The loadAllDialects parameter allows loading all dialects from the
+  /// global registry on Context construction. It is deprecated and will be
+  /// removed soon.
+  explicit MLIRContext(bool loadAllDialects = true);
   ~MLIRContext();
 
-  /// Return information about all registered IR dialects.
-  std::vector<Dialect *> getRegisteredDialects();
+  /// Return information about all IR dialects loaded in the context.
+  std::vector<Dialect *> getLoadedDialects();
+
+  /// Return the dialect registry associated with this context.
+  DialectRegistry &getDialectRegistry();
+
+  /// Return information about all available dialects in the registry in this
+  /// context.
+  std::vector<StringRef> getAvailableDialects();
 
   /// Get a registered IR dialect with the given namespace. If an exact match is
   /// not found, then return nullptr.
-  Dialect *getRegisteredDialect(StringRef name);
+  Dialect *getLoadedDialect(StringRef name);
 
   /// Get a registered IR dialect for the given derived dialect type. The
   /// derived type must provide a static 'getDialectNamespace' method.
-  template <typename T> T *getRegisteredDialect() {
-    return static_cast<T *>(getRegisteredDialect(T::getDialectNamespace()));
+  template <typename T>
+  T *getLoadedDialect() {
+    return static_cast<T *>(getLoadedDialect(T::getDialectNamespace()));
   }
 
   /// Get (or create) a dialect for the given derived dialect type. The derived
   /// type must provide a static 'getDialectNamespace' method.
   template <typename T>
-  T *getOrCreateDialect() {
-    return static_cast<T *>(getOrCreateDialect(
-        T::getDialectNamespace(), TypeID::get<T>(), [this]() {
+  T *getOrLoadDialect() {
+    return static_cast<T *>(
+        getOrLoadDialect(T::getDialectNamespace(), TypeID::get<T>(), [this]() {
           std::unique_ptr<T> dialect(new T(this));
-          dialect->dialectID = TypeID::get<T>();
           return dialect;
         }));
   }
 
+  /// Load a dialect in the context.
+  template <typename Dialect>
+  void loadDialect() {
+    getOrLoadDialect<Dialect>();
+  }
+
+  /// Load a list of dialects in the context.
+  template <typename Dialect, typename OtherDialect, typename... MoreDialects>
+  void loadDialect() {
+    getOrLoadDialect<Dialect>();
+    loadDialect<OtherDialect, MoreDialects...>();
+  }
+
+  /// Deprecated: load all globally registered dialects into this context.
+  /// This method will be removed soon, it can be used temporarily as we're
+  /// phasing out the global registry.
+  void loadAllGloballyRegisteredDialects();
+
+  /// Get (or create) a dialect for the given derived dialect name.
+  /// The dialect will be loaded from the registry if no dialect is found.
+  /// If no dialect is loaded for this name and none is available in the
+  /// registry, returns nullptr.
+  Dialect *getOrLoadDialect(StringRef name);
+
   /// Return true if we allow to create operation for unregistered dialects.
   bool allowsUnregisteredDialects();
 
@@ -123,10 +160,12 @@ class MLIRContext {
   const std::unique_ptr<MLIRContextImpl> impl;
 
   /// Get a dialect for the provided namespace and TypeID: abort the program if
-  /// a dialect exist for this namespace with different TypeID. Returns a
-  /// pointer to the dialect owned by the context.
-  Dialect *getOrCreateDialect(StringRef dialectNamespace, TypeID dialectID,
-                              function_ref<std::unique_ptr<Dialect>()> ctor);
+  /// a dialect exists for this namespace with a different TypeID. If a dialect
+  /// has not been loaded for this namespace/TypeID yet, use the provided ctor
+  /// to create one on the fly and load it. Returns a pointer to the dialect
+  /// owned by the context.
+  Dialect *getOrLoadDialect(StringRef dialectNamespace, TypeID dialectID,
+                            function_ref<std::unique_ptr<Dialect>()> ctor);
 
   MLIRContext(const MLIRContext &) = delete;
   void operator=(const MLIRContext &) = delete;
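For clients that want dialects loaded eagerly rather than lazily, the variadic
loadDialect<> above offers the non-deprecated path. A short sketch (the dialect
set is chosen purely for illustration):

    #include "mlir/Dialect/StandardOps/IR/Ops.h"
    #include "mlir/Dialect/Vector/VectorOps.h"
    #include "mlir/IR/MLIRContext.h"

    int main() {
      mlir::MLIRContext context(/*loadAllDialects=*/false);
      // Expands to one getOrLoadDialect<>() call per listed dialect.
      context.loadDialect<mlir::StandardOpsDialect,
                          mlir::vector::VectorDialect>();
      return 0;
    }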
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td
index 9cc57a61728949..a28410f028d5f0 100644
--- a/mlir/include/mlir/IR/OpBase.td
+++ b/mlir/include/mlir/IR/OpBase.td
@@ -244,6 +244,11 @@ class Dialect {
   // The description of the dialect.
   string description = ?;
 
+  // A list of dialects this dialect will load on construction as dependencies.
+  // These are dialects that this dialect may be involved with in
+  // canonicalization patterns or interfaces.
+  list<string> dependentDialects = [];
+
   // The C++ namespace that ops of this dialect should be placed into.
   //
   // By default, uses the name of the dialect as the only namespace. To avoid
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index b76b26fe348346..147ececc4c5a96 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -35,30 +35,35 @@
 
 namespace mlir {
 
+// Add all the MLIR dialects to the provided registry.
+inline void registerAllDialects(DialectRegistry &registry) {
+  // clang-format off
+  registry.insert();
+  // clang-format on
+}
+
 // This function should be called before creating any MLIRContext if one expect
 // all the possible dialects to be made available to the context automatically.
 inline void registerAllDialects() {
-  static bool init_once = []() {
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    registerDialect();
-    return true;
-  }();
-  (void)init_once;
+  static bool initOnce =
+      ([]() { registerAllDialects(getGlobalDialectRegistry()); }(), true);
+  (void)initOnce;
 }
 
 } // namespace mlir
diff --git a/mlir/include/mlir/InitAllTranslations.h b/mlir/include/mlir/InitAllTranslations.h
index 31ca0254cf8999..a1771dab144c04 100644
--- a/mlir/include/mlir/InitAllTranslations.h
+++ b/mlir/include/mlir/InitAllTranslations.h
@@ -28,7 +28,7 @@ void registerAVX512ToLLVMIRTranslation();
 // expects all the possible translations to be made available to the context
 // automatically.
 inline void registerAllTranslations() {
-  static bool init_once = []() {
+  static bool initOnce = []() {
     registerFromLLVMIRTranslation();
     registerFromSPIRVTranslation();
     registerToLLVMIRTranslation();
@@ -38,7 +38,7 @@ inline void registerAllTranslations() {
     registerAVX512ToLLVMIRTranslation();
     return true;
   }();
-  (void)init_once;
+  (void)initOnce;
 }
 
 } // namespace mlir
diff --git a/mlir/include/mlir/Pass/Pass.h b/mlir/include/mlir/Pass/Pass.h
index 8de31d9443190d..cd4c06acd070b4 100644
--- a/mlir/include/mlir/Pass/Pass.h
+++ b/mlir/include/mlir/Pass/Pass.h
@@ -9,6 +9,7 @@
 #ifndef MLIR_PASS_PASS_H
 #define MLIR_PASS_PASS_H
 
+#include "mlir/IR/Dialect.h"
 #include "mlir/IR/Function.h"
 #include "mlir/Pass/AnalysisManager.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -57,6 +58,13 @@ class Pass {
   /// Returns the derived pass name.
   virtual StringRef getName() const = 0;
 
+  /// Register dependent dialects for the current pass.
+  /// A pass is expected to register the dialects it will create entities for
+  /// (Operations, Types, Attributes), other than dialects that already exist
+  /// in the input. For example, a pass that converts from Linalg to Affine
+  /// would register the Affine dialect but does not need to register Linalg.
+  virtual void getDependentDialects(DialectRegistry &registry) const {}
+
   /// Returns the command line argument used when registering this pass. Return
   /// an empty string if one does not exist.
   virtual StringRef getArgument() const {
diff --git a/mlir/include/mlir/Pass/PassBase.td b/mlir/include/mlir/Pass/PassBase.td
index 54b44031559e72..7a2feff4fe0454 100644
--- a/mlir/include/mlir/Pass/PassBase.td
+++ b/mlir/include/mlir/Pass/PassBase.td
@@ -78,6 +78,9 @@ class PassBase<string passArg, string base> {
   // A C++ constructor call to create an instance of this pass.
   code constructor = [{}];
 
+  // A list of dialects this pass may produce entities in.
+  list<string> dependentDialects = [];
+
   // A set of options provided by this pass.
   list